class Runner:
    """
    Abstraction over "spawn a command and collect its result".

    A Tester hands its command to a Runner instead of executing it
    directly, which lets the harness swap in different execution back
    ends: a plain local subprocess, or an external queue such as PBS
    or slurm on HPC machines.
    """
    def __init__(self, tester):
        # The Tester whose command this runner executes
        self.tester = tester
        # Exit code of the finished process; filled in after wait()
        self.exit_code = None

    def spawn(self, cmd, cwd, timer):
        """
        Start running `cmd` in directory `cwd`.

        Completion is observed via wait(). Derived classes override this.
        """
        pass

    def wait(self, timer):
        """
        Block until the process started by spawn() has finished.

        Derived classes override this.
        """
        pass

    def kill(self):
        """
        Terminate the process started by spawn().

        Derived classes override this.
        """
        pass

    def getOutput(self):
        """
        Return the combined stdout/stderr of the process.

        Derived classes override this.
        """
        return None

    def getExitCode(self):
        """
        Return the exit code recorded by wait().
        """
        return self.exit_code

    def isOutputReady(self):
        """
        Return whether the output can be read yet.

        Derived classes override this.
        """
        return None
+ """ + def __init__(self, tester): + Runner.__init__(self, tester) + + # The output file handler + self.outfile = None + # The error file handler + self.errfile = None + # The underlying subprocess + self.process = None + + def spawn(self, cmd, cwd, timer): + use_shell = self.tester.specs["use_shell"] + + # Split command into list of args to be passed to Popen + if not use_shell: + cmd = shlex.split(cmd) + + self.process = None + try: + self.outfile = SpooledTemporaryFile(max_size=1000000) # 1M character buffer + self.errfile = SpooledTemporaryFile(max_size=100000) # 100K character buffer + + process_args = [cmd] + process_kwargs = {'stdout': self.outfile, + 'stderr': self.errfile, + 'close_fds': False, + 'shell': use_shell, + 'cwd': cwd} + # On Windows, there is an issue with path translation when the command is passed in + # as a list. + if platform.system() == "Windows": + process_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP + else: + process_kwargs['preexec_fn'] = os.setsid + + # Special logic for openmpi runs + if self.hasOpenMPI(): + popen_env = os.environ.copy() + + # Don't clobber state + popen_env['OMPI_MCA_orte_tmpdir_base'] = self.getTempDirectory().name + # Allow oversubscription for hosts that don't have a hostfile + popen_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' + + popen_kwargs['env'] = popen_env + + self.process = subprocess.Popen(*process_args, **process_kwargs) + except: + print("Error in launching a new task", cmd) + traceback.print_exc() + raise + + timer.start() + + def wait(self, timer): + self.process.wait() + + timer.stop() + + self.exit_code = self.process.poll() + self.outfile.flush() + self.errfile.flush() + + # store the contents of output, and close the file + self.outfile.close() + self.errfile.close() + + def kill(self): + if self.process is not None: + try: + if platform.system() == "Windows": + from distutils import spawn + if spawn.find_executable("taskkill"): + subprocess.call(['taskkill', 
'/F', '/T', '/PID', str(self.process.pid)]) + else: + self.process.terminate() + else: + pgid = os.getpgid(self.process.pid) + os.killpg(pgid, SIGTERM) + except OSError: # Process already terminated + pass + + def getOutput(self): + return util.readOutput(self.outfile, self.errfile, self) + + def isOutputReady(self): + return not self.outfile is None and self.outfile.closed and not self.errfile is None and self.errfile.closed diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 9f5deccfff0a..d8fb25d5a4a3 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -51,7 +51,6 @@ def __init__(self, tester, job_dag, options): self.specs = tester.specs self.__job_dag = job_dag self.timer = Timer() - self.__outfile = None self.__start_time = clock() self.__end_time = None self.__previous_time = None @@ -272,8 +271,7 @@ def getOutputFile(self): def setOutput(self, output): """ Method to allow schedulers to overwrite the output if certain conditions are met """ - if (not self.__tester.outfile is None and not self.__tester.outfile.closed - and not self.__tester.errfile is None and not self.__tester.errfile.closed): + if not self.__tester.isOutputReady(): return # Check for invalid unicode in output diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 98e4d062eef9..d59b80a34867 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -12,6 +12,8 @@ from TestHarness.schedulers.Scheduler import Scheduler from TestHarness.StatusSystem import StatusSystem from TestHarness import util +from TestHarness.runners.SubprocessRunner import Runner, SubprocessRunner +from TestHarness.testers.Tester import Tester class RunParallel(Scheduler): """ @@ -30,6 +32,7 @@ def run(self, job): """ Run a tester command """ tester = job.getTester() + tester.setRunner(self.buildRunner(tester)) # Do not execute 
app, and do not processResults if self.options.dry_run: @@ -93,6 +96,14 @@ def run(self, job): # Set testers output with modifications made above so it prints the way we want it job.setOutput(output) + def buildRunner(self, tester: Tester) -> Runner: + """Builds the runner for a given tester + + This exists as a method so that derived schedulers can change how they + run commands (i.e., for PBS and slurm) + """ + return SubprocessRunner(tester) + def setSuccessfulMessage(self, tester): """ properly set a finished successful message for tester """ message = '' diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 38a4585a159d..70ef3b9cf2a3 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -76,12 +76,12 @@ def processResults(self, moose_dir, options, output): specs = self.specs if specs.isValid('expect_out'): out_ok = util.checkOutputForPattern(output, specs['expect_out']) - if (out_ok and self.exit_code != 0): + if (out_ok and self.getExitCode() != 0): reason = 'OUT FOUND BUT CRASH' elif (not out_ok): reason = 'NO EXPECTED OUT' if reason == '': - if self.exit_code != 0 : + if self.getExitCode() != 0 : reason = 'CRASH' if reason != '': diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 25c325eb0d93..ceff7a48361f 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -314,18 +314,18 @@ def testExitCodes(self, moose_dir, options, output): # in the derived class. 
if options.valgrind_mode == '' and not specs.isValid('expect_err') and len( [x for x in filter( lambda x: x in output, specs['errors'] )] ) > 0: reason = 'ERRMSG' - elif self.exit_code == 0 and specs['should_crash'] == True: + elif self.getExitCode() == 0 and specs['should_crash'] == True: reason = 'NO CRASH' - elif self.exit_code != 0 and specs['should_crash'] == False: + elif self.getExitCode() != 0 and specs['should_crash'] == False: # Let's look at the error code to see if we can perhaps further split this out later with a post exam reason = 'CRASH' # Valgrind runs - elif self.exit_code == 0 and self.shouldExecute() and options.valgrind_mode != '' and 'ERROR SUMMARY: 0 errors' not in output: + elif self.getExitCode() == 0 and self.shouldExecute() and options.valgrind_mode != '' and 'ERROR SUMMARY: 0 errors' not in output: reason = 'MEMORY ERROR' if reason != '': self.setStatus(self.fail, str(reason)) - return "\n\nExit Code: " + str(self.exit_code) + return "\n\nExit Code: " + str(self.getExitCode()) # Return anything extra here that we want to tack onto the Output for when it gets printed later return '' diff --git a/python/TestHarness/testers/RunCommand.py b/python/TestHarness/testers/RunCommand.py index 74c1d879c05e..ff5e888c18b2 100644 --- a/python/TestHarness/testers/RunCommand.py +++ b/python/TestHarness/testers/RunCommand.py @@ -27,9 +27,9 @@ def getCommand(self, options): return self.command def processResults(self, moose_dir, options, output): - if self.exit_code == 77 : + if self.getExitCode() == 77 : self.setStatus(self.skip) - elif self.exit_code != 0 : - self.setStatus(self.fail, 'CODE %d' % self.exit_code) + elif self.getExitCode() != 0 : + self.setStatus(self.fail, 'CODE %d' % self.getExitCode()) return output diff --git a/python/TestHarness/testers/SignalTester.py b/python/TestHarness/testers/SignalTester.py index e26524f17713..1852308dbc7c 100644 --- a/python/TestHarness/testers/SignalTester.py +++ b/python/TestHarness/testers/SignalTester.py 
@@ -72,7 +72,7 @@ def runCommand(self, timer, options): in the tester's output and exit_code fields. """ - exit_code = super().spawnSubprocessFromOptions(timer, options) + exit_code = super().spawnProcessFromOptions(timer, options) if exit_code: # Something went wrong return diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 07b018d5ff84..9fc733bd4ba5 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -11,6 +11,7 @@ import mooseutils from TestHarness import util from TestHarness.StatusSystem import StatusSystem +from TestHarness.runners.Runner import Runner from FactorySystem.MooseObject import MooseObject from tempfile import SpooledTemporaryFile, TemporaryDirectory from pathlib import Path @@ -127,10 +128,7 @@ def __del__(self): def __init__(self, name, params): MooseObject.__init__(self, name, params) self.specs = params - self.outfile = None - self.errfile = None self.joined_out = '' - self.exit_code = 0 self.process = None self.tags = params['tags'] self.__caveats = set([]) @@ -166,6 +164,9 @@ def __init__(self, name, params): # A temp directory for this Tester, if requested self.tmp_dir = None + # The object that'll actually do the run + self._runner = None + def getTempDirectory(self): """ Gets a shared temp directory that will be cleaned up for this Tester @@ -188,6 +189,22 @@ def cleanup(self): pass self.tmp_dir = None + def setRunner(self, runner: Runner): + """Sets the underlying Runner object that will run the command""" + self._runner = runner + + def getOutput(self) -> str: + """Return the combined contents of stdout and stderr of the command ran""" + return self._runner.getOutput() + + def isOutputReady(self) -> bool: + """Returns whether or not the output is ready for reading""" + return self._runner.isOutputReady() + + def getExitCode(self) -> int: + """Gets the exit code of the command that was ran""" + return self._runner.getExitCode() + def getStatus(self): 
return self.test_status.getStatus() @@ -285,10 +302,6 @@ def getOutputFiles(self): """ return the output files if applicable to this Tester """ return [] - def getOutput(self): - """ Return the contents of stdout and stderr """ - return self.joined_out - def getCheckInput(self): return self.check_input @@ -359,19 +372,12 @@ def hasOpenMPI(self): return False return Path(which_mpiexec).parent.absolute() == Path(which_ompi_info).parent.absolute() - def spawnSubprocessFromOptions(self, timer, options): + def spawnProcessFromOptions(self, timer, options): """ - Spawns a subprocess based on given options, sets output and error files, + Spawns a process based on given options, sets output and error files, and starts timer. """ cmd = self.getCommand(options) - - use_shell = self.specs["use_shell"] - - if not use_shell: - # Split command into list of args to be passed to Popen - cmd = shlex.split(cmd) - cwd = self.getTestDir() # Verify that the working directory is available right before we execute. @@ -382,64 +388,18 @@ def spawnSubprocessFromOptions(self, timer, options): timer.stop() return 1 - self.process = None - try: - f = SpooledTemporaryFile(max_size=1000000) # 1M character buffer - e = SpooledTemporaryFile(max_size=100000) # 100K character buffer - - popen_args = [cmd] - popen_kwargs = {'stdout': f, - 'stderr': e, - 'close_fds': False, - 'shell': use_shell, - 'cwd': cwd} - # On Windows, there is an issue with path translation when the command - # is passed in as a list. 
- if platform.system() == "Windows": - popen_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP - else: - popen_kwargs['preexec_fn'] = os.setsid + # Spawn the process + self._runner.spawn(cmd, cwd, timer) - # Special logic for openmpi runs - if self.hasOpenMPI(): - popen_env = os.environ.copy() - - # Don't clobber state - popen_env['OMPI_MCA_orte_tmpdir_base'] = self.getTempDirectory().name - # Allow oversubscription for hosts that don't have a hostfile - popen_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' - - popen_kwargs['env'] = popen_env - - process = subprocess.Popen(*popen_args, **popen_kwargs) - except: - print("Error in launching a new task", cmd) - raise - - self.process = process - self.outfile = f - self.errfile = e - - timer.start() return 0 - def finishAndCleanupSubprocess(self, timer): + def finishAndCleanupProcess(self, timer): """ - Waits for the current subproccess to finish, stops the timer, and + Waits for the current process to finish, stops the timer, and cleans up. """ - self.process.wait() - timer.stop() - - self.exit_code = self.process.poll() - self.outfile.flush() - self.errfile.flush() - - # store the contents of output, and close the file - self.joined_out = util.readOutput(self.outfile, self.errfile, self) - self.outfile.close() - self.errfile.close() + self._runner.wait(timer) def runCommand(self, timer, options): """ @@ -449,29 +409,17 @@ def runCommand(self, timer, options): in the tester's output and exit_code fields. """ - exit_code = self.spawnSubprocessFromOptions(timer, options) + exit_code = self.spawnProcessFromOptions(timer, options) if exit_code: # Something went wrong return - self.finishAndCleanupSubprocess(timer) + self.finishAndCleanupProcess(timer) def killCommand(self): """ Kills any currently executing process started by the runCommand method. 
""" - if self.process is not None: - try: - if platform.system() == "Windows": - from distutils import spawn - if spawn.find_executable("taskkill"): - subprocess.call(['taskkill', '/F', '/T', '/PID', str(self.process.pid)]) - else: - self.process.terminate() - else: - pgid = os.getpgid(self.process.pid) - os.killpg(pgid, SIGTERM) - except OSError: # Process already terminated - pass + return self._runner.kill() # Try to clean up anything else that we can self.cleanup() From e9beea374cb65b60e3f39db9661f389737043708 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 09:05:00 -0600 Subject: [PATCH 002/243] Transition PBS exeuction to RunParallel refs #27562 --- .gitignore | 4 + python/TestHarness/JobDAG.py | 2 +- python/TestHarness/TestHarness.py | 50 +- python/TestHarness/runners/PBSRunner.py | 125 +++++ python/TestHarness/runners/Runner.py | 24 +- .../TestHarness/runners/SubprocessRunner.py | 114 ++-- python/TestHarness/schedulers/Job.py | 23 +- python/TestHarness/schedulers/QueueManager.py | 446 --------------- python/TestHarness/schedulers/RunPBS.py | 526 +++++++++++------- python/TestHarness/schedulers/RunParallel.py | 11 +- python/TestHarness/schedulers/Scheduler.py | 89 ++- .../schedulers/pbs_source_apptainer | 1 + python/TestHarness/schedulers/pbs_template | 53 +- python/TestHarness/testers/AnalyzeJacobian.py | 13 +- python/TestHarness/testers/CSVDiff.py | 2 +- python/TestHarness/testers/CheckFiles.py | 11 +- python/TestHarness/testers/Exodiff.py | 2 +- python/TestHarness/testers/FileTester.py | 2 +- python/TestHarness/testers/ImageDiff.py | 2 +- python/TestHarness/testers/PythonUnitTest.py | 3 +- python/TestHarness/testers/RunApp.py | 20 +- python/TestHarness/testers/RunCommand.py | 2 +- python/TestHarness/testers/RunException.py | 13 +- python/TestHarness/testers/SignalTester.py | 46 +- python/TestHarness/testers/Tester.py | 113 ++-- python/TestHarness/testers/bench.py | 6 +- python/TestHarness/util.py | 2 +- 27 files changed, 817 
insertions(+), 888 deletions(-) create mode 100644 python/TestHarness/runners/PBSRunner.py delete mode 100644 python/TestHarness/schedulers/QueueManager.py create mode 100644 python/TestHarness/schedulers/pbs_source_apptainer diff --git a/.gitignore b/.gitignore index 86df037c5f3e..bea0d73b9183 100644 --- a/.gitignore +++ b/.gitignore @@ -326,3 +326,7 @@ share/ /modules/misc/misc.yaml /tutorials/tutorial01_app_development/*/babbler.yaml /tutorials/darcy_thermo_mech/*/darcy_thermo_mech.yaml + +# test harness pbs output +pbs_*.qsub +pbs_*.out diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 199cd512441a..be92e8af4df4 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -225,7 +225,7 @@ def _doRaceConditions(self): output_to_job = {} for job in self.__job_dag.topological_sort(): if job.getRunnable() and not job.isFinished(): - for output_file in job.getOutputFiles(): + for output_file in job.getOutputFiles(self.options): output_to_job[output_file] = output_to_job.get(output_file, []) output_to_job[output_file].append(job) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 6e4cfd85bd4e..4bb442edf4ee 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -429,7 +429,6 @@ def findAndRunTests(self, find_only=False): # Create the testers for this test testers = self.createTesters(dirpath, file, find_only, testroot_params) - # Schedule the testers (non blocking) self.scheduler.schedule(testers) @@ -525,8 +524,8 @@ def augmentParameters(self, filename, tester, testroot_params={}): params['spec_file'] = filename params['test_name'] = formatted_name + params['test_name_short'] = relative_hitpath params['test_dir'] = test_dir - params['relative_path'] = relative_path params['executable'] = testroot_params.get("executable", self.executable) params['app_name'] = self.app_name params['hostname'] = self.host_name @@ -601,7 +600,15 @@ def 
printOutput(self, job, color): output = '' # Print what ever status the tester has at the time if self.options.verbose or (job.isFail() and not self.options.quiet): - output = 'Working Directory: ' + job.getTestDir() + '\nRunning command: ' + job.getCommand() + '\n' + if job.getCommandRan(): + command = job.getCommandRan() + else: + cmd, mpi_cmd = job.getCommand() + command = '' + if mpi_cmd: + command += f'{mpi_cmd} ' + command += cmd + output = 'Working Directory: ' + job.getTestDir() + '\nRunning command: ' + command + '\n' output += util.trimOutput(job, self.options) output = output.replace('\r', '\n') # replace the carriage returns with newlines lines = output.split('\n') @@ -648,17 +655,11 @@ def handleJobStatus(self, job): # Just print current status without saving results else: - print((util.formatResult(job, self.options, result='RUNNING', caveats=False))) + # TODO: changed this caveats=True + print((util.formatResult(job, self.options, result=job.getStatus().status, caveats=True))) # Print final results, close open files, and exit with the correct error code def cleanup(self): - if self.options.queue_cleanup and self.options.results_file: - try: - os.remove(self.options.results_file) - except OSError: - pass - return - # Print the results table again if a bunch of output was spewed to the screen between # tests as they were running if len(self.parse_errors) > 0: @@ -681,10 +682,6 @@ def cleanup(self): fatal_error += ', FATAL PARSER ERROR' self.error_code = self.error_code | 0x80 - # Alert the user to their session file - if self.options.queueing and not self.options.dry_run: - print(('Your session file is %s' % self.options.results_file)) - # Print a different footer when performing a dry run if self.options.dry_run: print(('Processed %d tests in %.1f seconds.' 
% (self.num_passed+self.num_skipped, time))) @@ -806,10 +803,6 @@ def writeResults(self): # Write some useful data to our results_storage for job_group in all_jobs: for job in job_group: - # If queueing, do not store silent results in session file - if job.isSilent() and self.options.queueing: - continue - status, message, message_color, status_code, sort_value = job.getJointStatus() # Create empty key based on TestDir, or re-inialize with existing data so we can append to it @@ -905,17 +898,8 @@ def initialize(self, argv, app_name): plugin_paths = [os.path.join(self.moose_dir, 'python', 'TestHarness'), os.path.join(self.moose_dir, 'share', 'moose', 'python', 'TestHarness')] self.factory.loadPlugins(plugin_paths, 'schedulers', "IS_SCHEDULER") - self.options.queueing = False if self.options.pbs: - # original_storage will become the results file for each test being launched by PBS, and will be - # saved in the same directory as the test spec file. This is so we can launch multiple 'run_tests' - # without clobbering the parent results_file. Meanwhile, the new results_file is going to be - # renamed to whatever the user decided to identify their PBS launch with. - self.original_storage = self.options.results_file - self.options.results_file = os.path.abspath(self.options.pbs) - self.options.queueing = True scheduler_plugin = 'RunPBS' - # The default scheduler plugin else: scheduler_plugin = 'RunParallel' @@ -1073,12 +1057,11 @@ def parseCLArgs(self, argv): outputgroup.add_argument("--show-last-run", action="store_true", dest="show_last_run", help="Display previous results without executing tests again") queuegroup = parser.add_argument_group('Queue Options', 'Options controlling which queue manager to use') - queuegroup.add_argument('--pbs', nargs=1, action='store', metavar='name', help='Launch tests using PBS as your scheduler. 
You must supply a name to identify this session with') + queuegroup.add_argument('--pbs', action='store_true', dest='pbs', help='Launch tests using PBS as your scheduler') queuegroup.add_argument('--pbs-pre-source', nargs=1, action="store", dest='queue_source_command', metavar='', help='Source specified file before launching tests') queuegroup.add_argument('--pbs-project', nargs=1, action='store', dest='queue_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') queuegroup.add_argument('--pbs-queue', nargs=1, action='store', dest='queue_queue', type=str, metavar='', help='Submit jobs to the specified queue') - queuegroup.add_argument('--pbs-node-cpus', nargs=1, action='store', type=int, default=None, metavar='', help='CPUS Per Node. The default (no setting), will always use only one node') - queuegroup.add_argument('--pbs-cleanup', nargs=1, action="store", dest='queue_cleanup', metavar='name', help='Clean up files generated by supplied --pbs name') + queuegroup.add_argument('--pbs-host', nargs=1, action='store', dest='queue_host', metavar='', help='The PBS host to use for submitting jobs') code = True if self.code.decode() in argv: @@ -1117,9 +1100,6 @@ def checkAndUpdateCLArgs(self): if opts.spec_file and not os.path.exists(opts.spec_file): print('ERROR: --spec-file supplied but path does not exist') sys.exit(1) - if opts.queue_cleanup and not opts.pbs: - print('ERROR: --queue-cleanup cannot be used without additional queue options') - sys.exit(1) if opts.queue_source_command and not os.path.exists(opts.queue_source_command): print('ERROR: pre-source supplied but path does not exist') sys.exit(1) @@ -1145,8 +1125,6 @@ def checkAndUpdateCLArgs(self): # other spec files. They only know about the jobs a single spec file generates. # NOTE: Which means, tests and speedtests running simultaneously currently have a chance to # clobber each others output during normal operation!? 
#* This file is part of the MOOSE framework
#* https://www.mooseframework.org
#*
#* All rights reserved, see COPYRIGHT for full restrictions
#* https://github.com/idaholab/moose/blob/master/COPYRIGHT
#*
#* Licensed under LGPL 2.1, please see LICENSE for details
#* https://www.gnu.org/licenses/lgpl-2.1.html

from TestHarness.runners.Runner import Runner
import time, os

class PBSRunner(Runner):
    """Runner that spawns a process with PBS.

    To be used with the RunPBS scheduler.
    """
    def __init__(self, job, options, run_pbs):
        Runner.__init__(self, job, options)
        # The RunPBS scheduler that submits, polls, and kills the PBS job
        self.run_pbs = run_pbs

        # Number of seconds to try to wait for the output
        # We don't want to wait forever for output because
        # if the job ended in an unexpected state, it might
        # not even be using the output and we don't want to
        # just hang forever
        self.wait_output_time = 60

    def spawn(self, timer):
        """Submit the job to PBS and start the timer."""
        self.run_pbs.submitJob(self.job)

        timer.start()

    def wait(self, timer):
        """Poll until the PBS job finishes, then gather its output.

        After completion, waits up to wait_output_time seconds for the
        PBS stdout/stderr file (and any Tester output files, if the
        Tester requires them) to land on disk, since on networked HPC
        filesystems they can appear with a delay.
        """
        # Poll loop waiting for the job to be finished
        # This gets a structure that represents the job, and the
        # polling itself is only done on occasion within RunPBS
        while True:
            time.sleep(1)
            pbs_job = self.run_pbs.getPBSJob(self.job)

            # We're done
            if pbs_job.done:
                self.exit_code = pbs_job.exit_code
                break

        timer.stop()

        # The PBS output (stdout+stderr)
        output_file = self.run_pbs.getPBSJobOutputPath(self.job)

        # If the Job is already finished, something happened in PBS
        # so we have an invalid state for processing in the Tester
        if self.job.isFinished():
            self.exit_code = -1
            self.output = ''

            # If we have output, we should try to add it (best effort)
            # TODO: shorten output as an option?
            if os.path.isfile(output_file):
                try:
                    # BUGFIX: previously read `file`, an undefined name at this
                    # point (NameError); the PBS output path is output_file.
                    # Also close the handle via a context manager.
                    with open(output_file, 'r') as f:
                        self.output = f.read()
                except Exception:
                    # Best effort only: missing/unreadable output is tolerated
                    pass

            # Don't bother looking for the rest of the output
            return

        tester = self.job.getTester()

        # We've actually ran something now and not just qsub, so update the
        # command to what was ran there
        tester.setCommandRan(pbs_job.command)

        # Determine the output files that we need to wait for to be complete
        wait_files = {output_file}
        # Output files needed by the Tester, only if it says we should
        if tester.mustOutputExist():
            for file in tester.getOutputFiles(self.options):
                wait_files.add(os.path.join(tester.getTestDir(), file))

        # Wait for all of the files to be available
        file_poll_interval = 0.5
        waited_time = 0
        while wait_files:
            # Look for each file
            for file in wait_files.copy():
                # File exists
                if os.path.isfile(file):
                    # Special case for stdout/stderr: RunPBS appends an ending
                    # comment so that we can tell the file is fully written;
                    # strip that sentinel from the stored output
                    # TODO: shorten output as an option?
                    if file == output_file:
                        with open(file, 'r') as f:
                            output = f.read()
                        ending_comment = self.run_pbs.getOutputEndingComment()
                        if ending_comment in output:
                            self.output = output.replace(ending_comment, '')
                        else:
                            # Not fully written yet; check again next pass
                            continue
                    # Done with this file
                    wait_files.discard(file)

            # We've waited for files for too long
            if wait_files and waited_time >= self.wait_output_time:
                self.job.setStatus(self.job.error, 'FILE TIMEOUT')
                if not self.output:
                    self.output = ''
                self.output += '#' * 80 + '\nUnavailable output file(s)\n' + '#' * 80 + '\n'
                for file in wait_files:
                    self.output += file + '\n'
                self.output += '\n'
                break

            waited_time += file_poll_interval
            time.sleep(file_poll_interval)

    def kill(self):
        """Ask the RunPBS scheduler to kill the underlying PBS job."""
        self.run_pbs.killJob(self.job)
""" - return None + raise Exception('sendSignal not supported for this Runner') diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index c916d37c0222..43b78b698159 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -1,8 +1,13 @@ -import os -import platform -import subprocess -import shlex -import traceback +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os, platform, subprocess, shlex, time from tempfile import SpooledTemporaryFile from signal import SIGTERM from TestHarness.runners.Runner import Runner @@ -12,8 +17,8 @@ class SubprocessRunner(Runner): """ Runner that spawns a local subprocess. 
""" - def __init__(self, tester): - Runner.__init__(self, tester) + def __init__(self, job, options): + Runner.__init__(self, job, options) # The output file handler self.outfile = None @@ -21,48 +26,53 @@ def __init__(self, tester): self.errfile = None # The underlying subprocess self.process = None + # The joined output + self.output = None - def spawn(self, cmd, cwd, timer): - use_shell = self.tester.specs["use_shell"] + def spawn(self, timer): + tester = self.job.getTester() + use_shell = tester.specs["use_shell"] + cmd, mpi_cmd = tester.getCommand(self.options) + if mpi_cmd is not None: + cmd = f'{mpi_cmd} {cmd}' + tester.setCommandRan(cmd) # Split command into list of args to be passed to Popen if not use_shell: cmd = shlex.split(cmd) self.process = None - try: - self.outfile = SpooledTemporaryFile(max_size=1000000) # 1M character buffer - self.errfile = SpooledTemporaryFile(max_size=100000) # 100K character buffer - - process_args = [cmd] - process_kwargs = {'stdout': self.outfile, - 'stderr': self.errfile, - 'close_fds': False, - 'shell': use_shell, - 'cwd': cwd} - # On Windows, there is an issue with path translation when the command is passed in - # as a list. 
- if platform.system() == "Windows": - process_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP - else: - process_kwargs['preexec_fn'] = os.setsid - - # Special logic for openmpi runs - if self.hasOpenMPI(): - popen_env = os.environ.copy() - - # Don't clobber state - popen_env['OMPI_MCA_orte_tmpdir_base'] = self.getTempDirectory().name - # Allow oversubscription for hosts that don't have a hostfile - popen_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' - - popen_kwargs['env'] = popen_env + self.outfile = SpooledTemporaryFile(max_size=1000000) # 1M character buffer + self.errfile = SpooledTemporaryFile(max_size=100000) # 100K character buffer + + process_args = [cmd] + process_kwargs = {'stdout': self.outfile, + 'stderr': self.errfile, + 'close_fds': False, + 'shell': use_shell, + 'cwd': tester.getTestDir()} + # On Windows, there is an issue with path translation when the command is passed in + # as a list. + if platform.system() == "Windows": + process_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP + else: + process_kwargs['preexec_fn'] = os.setsid + + # Special logic for openmpi runs + if self.hasOpenMPI(): + process_env = os.environ.copy() + + # Don't clobber state + process_env['OMPI_MCA_orte_tmpdir_base'] = self.getTempDirectory().name + # Allow oversubscription for hosts that don't have a hostfile + process_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' + + process_kwargs['env'] = process_env + try: self.process = subprocess.Popen(*process_args, **process_kwargs) - except: - print("Error in launching a new task", cmd) - traceback.print_exc() - raise + except Exception as e: + raise Exception('Error in launching a new task') from e timer.start() @@ -76,6 +86,7 @@ def wait(self, timer): self.errfile.flush() # store the contents of output, and close the file + self.output = util.readOutput(self.outfile, self.errfile, self.job.getTester()) self.outfile.close() self.errfile.close() @@ -95,7 +106,22 @@ 
def kill(self): pass def getOutput(self): - return util.readOutput(self.outfile, self.errfile, self) - - def isOutputReady(self): - return not self.outfile is None and self.outfile.closed and not self.errfile is None and self.errfile.closed + return self.output + + def sendSignal(self, signal): + # process.poll() returns the process's exit code if it has completed, + # and None if it is still running. This acts as a safety precaution + # against an infinite loop; this will always close. + while self.process.poll() is None: + + # tell() gives the current position in the file. If it is greater + # than zero, the binary has started running and writing output. If + # the output is blank, the moose_test binary hasn't actually started + # doing anything yet. If so, sleep briefly and check again. + if not self.outfile.tell(): + time.sleep(0.05) + + # If the output isn't blank, then we finally send the signal and exit the loop + else: + os.kill(self.process.pid, signal) + break diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index d8fb25d5a4a3..0c1198a81431 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -7,8 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import re, os, json -import time +import re, os, json, time from timeit import default_timer as clock from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker @@ -95,6 +94,10 @@ def __init__(self, tester, job_dag, options): # Initialize jobs with a holding status self.setStatus(self.hold) + # Whether or not we should forcefully report the status of this Job + # the next time report statuses + self.force_report_status = False + def getUpstreams(self): """ Return a list of all the jobs that needed to be completed before this job """ dag = self.getDAG() @@ -125,7 +128,7 @@ def getTestName(self): def getTestNameShort(self): """ 
Return the shorthand Test name """ - return self.getTestName().split('.')[1] + return self.__tester.getTestNameShort() def getPrereqs(self): """ Wrapper method to return the testers prereqs """ @@ -151,13 +154,17 @@ def getCommand(self): """ Wrapper method for returing command """ return self.__tester.getCommand(self.options) + def getCommandRan(self): + """ Wrapper method for returing command ran """ + return self.__tester.getCommandRan() + def getRunnable(self): """ Wrapper method to return getRunnable """ return self.__tester.getRunnable(self.options) - def getOutputFiles(self): + def getOutputFiles(self, options): """ Wrapper method to return getOutputFiles """ - return self.__tester.getOutputFiles() + return self.__tester.getOutputFiles(options) def getMaxTime(self): """ Wrapper method to return getMaxTime """ @@ -227,10 +234,10 @@ def run(self): self.__start_time = clock() self.timer.reset() - self.__tester.run(self.timer, self.options) + self.__tester.run(self, self.options, self.timer) self.__start_time = self.timer.starts[0] self.__end_time = self.timer.ends[-1] - self.__joined_out = self.__tester.joined_out + self.__joined_out = self.__tester.getOutput() if self.options.pedantic_checks and self.canParallel(): # Check if the files we checked on earlier were modified. 
@@ -359,7 +366,7 @@ def isQueued(self): return (_status == self.queued and self.isNoStatus()) \ or (_status in self.__finished_statuses and self.__tester.isQueued()) def isRunning(self): - return self.getStatus() == self.running + return self.getStatus() in self.job_status.getPendingStatuses() def isTimeout(self): return self.getStatus() == self.timeout def isPending(self): diff --git a/python/TestHarness/schedulers/QueueManager.py b/python/TestHarness/schedulers/QueueManager.py deleted file mode 100644 index 6be1b70d8fba..000000000000 --- a/python/TestHarness/schedulers/QueueManager.py +++ /dev/null @@ -1,446 +0,0 @@ -#* This file is part of the MOOSE framework -#* https://www.mooseframework.org -#* -#* All rights reserved, see COPYRIGHT for full restrictions -#* https://github.com/idaholab/moose/blob/master/COPYRIGHT -#* -#* Licensed under LGPL 2.1, please see LICENSE for details -#* https://www.gnu.org/licenses/lgpl-2.1.html - -import sys, os, json, shutil -from collections import namedtuple -from Scheduler import Scheduler -from TestHarness.StatusSystem import StatusSystem # Determin previous status - -class QueueManager(Scheduler): - """ - QueueManager is a Scheduler plugin responsible for allowing the testers to be scheduled via a - third-party queue system (like PBS). - - The QueueManager works by intercepting and altering the statuses of all but one job contained - in the group to a finished state. This affords us the behavior of only using the runner thread - pool once per group (see augmentJobs). - - Using this one unmodified job, the spec file involved is noted, and instructs the derived - scheduler how to launch this one single spec file (using --spec-file), along with any - supplied/allowable command line arguments (--re, --cli-args, --ignore, etc). - - The third-party queueing manager then executes `run_tests --spec-file /path/to/spec_file`. 
- - It is the results of this additional ./run_tests run, that is captured and presented to the user as - the finished result of the test. - """ - @staticmethod - def validParams(): - params = Scheduler.validParams() - return params - - def __init__(self, harness, params): - Scheduler.__init__(self, harness, params) - self.harness = harness - self.options = self.harness.getOptions() - self.__job_storage_file = self.harness.original_storage - self.__clean_args = None - self.__status_system = StatusSystem() - - def augmentJobs(self, Jobs): - """ - Filter through incomming jobs and figure out if we are launching them - or attempting to process finished results. - """ - if self.options.dry_run: - return - # Flatten the DAG. We want to easily iterate over all jobs produced by the spec file - Jobs.removeAllDependencies() - - # Perform cleanup operations and return if thats what the user wants - if self.options.queue_cleanup: - self._cleanupFiles(Jobs) - return - - # Create a namedtuple of frequently used information contained within Jobs, so we can - # more easily pass this information among our methods - job_list = Jobs.getJobs() - if job_list: - queue_data = namedtuple('JobData', ['jobs', 'job_dir', 'json_data', 'plugin']) - job_data = queue_data(jobs=Jobs, - job_dir=job_list[0].getTestDir(), - json_data=self.options.results_storage, - plugin=self.harness.scheduler.__class__.__name__) - - if self._isProcessReady(job_data): - self._setJobStatus(job_data) - - elif self._isLaunchable(job_data): - self._prepareJobs(job_data) - - def createQueueScript(self, job, template): - """ Write the launch script to disc """ - if self.options.dry_run: - return - # Get a list of prereq tests this test may have - try: - with open(self.params['queue_template'], 'r') as f: - content = f.read() - - with open(template['launch_script'], 'w') as queue_script: - - # Do all of the replacements for valid parameters - for key in template.keys(): - if key.upper() in content: - content = 
content.replace('<' + key.upper() + '>', str(template[key])) - - # Strip out parameters that were not supplied - for key in template.keys(): - if key.upper() not in content: - content = content.replace('<' + key.upper() + '>', '') - - queue_script.write(content) - except IOError as e: - print(e) - sys.exit(1) - - def reserveSlots(self, job, j_lock): - """ - Inherited method from the Scheduler to handle slot allocation. - QueueManager does not need an allocation system, so this method simply returns True - """ - return True - - def getBadArgs(self): - """ Arguments which should be removed from the launch script invoking ./run_tests """ - return [] - - def getBadKeyArgs(self): - """ Key/Value arguments which should be removed from the launch script invoking ./run_tests """ - return [] - - def getCores(self, job_data): - """ iterate over Jobs and collect the maximum core requirement from the group of jobs which will run """ - slots = 1 - for job in [x for x in job_data.jobs.getJobs() if not x.isSkip()]: - slots = max(slots, job.getSlots()) - - return slots - - def getMaxTime(self, job_data): - """ iterate over Jobs and increment the total allowable time needed to complete the entire group """ - total_time = 0 - for job in [x for x in job_data.jobs.getJobs() if not x.isSkip()]: - total_time += int(job.getMaxTime()) - - return total_time - - def addDirtyFiles(self, job, file_list=[]): - """ append list of files which will be generated by derived scheduler """ - _dirty_files = self.getDirtyFiles(job) - file_list.extend(_dirty_files) - file_list = list(set(file_list)) - job.addMetaData(DIRTY_FILES=file_list) - - def getDirtyFiles(self, job): - """ return list of files not indigenous to the repository which was created by third party schedulers """ - return job.getMetaData().get('DIRTY_FILES', []) - - def cleanAndModifyArgs(self): - """ - Filter out any arguments that will otherwise break the TestHarness when launched _within_ - the third party scheduler (such as --pbs) - 
""" - # return cached args if we have already produced clean args - if not self.__clean_args: - current_args = list(sys.argv[1:]) - - # Ask derived schedulers for any additional args we should strip from sys.args - bad_args = self.getBadArgs() - bad_keyword_args = self.getBadKeyArgs() - - # Split keyword args so we can match/remove them (the key, and its value pair) - key_value_args = [x for x in current_args if '=' in x] - for arg in key_value_args: - current_args.remove(arg) - current_args.extend(arg.split('=')) - - # Note: we are removing cli-args/ignore because we need to re-encapsulate them below - bad_keyword_args.extend(['--spec-file', '-i', '--cli-args', '-j', '-l', '-o', '--output-dir', '--ignore', '--re']) - - # remove the key=value pair argument - for arg in bad_keyword_args: - if arg in current_args: - key = current_args.index(arg) - del current_args[key:key+2] - - # Special: re-encapsulate --cli-args - if self.options.cli_args: - current_args.extend(['--cli-args', '"%s"' % self.options.cli_args]) - if self.options.ignored_caveats: - current_args.extend(['--ignore', '"%s"' % self.options.ignored_caveats]) - if self.options.reg_exp: - current_args.extend(['--re', '"%s"' % self.options.reg_exp]) - - # remove any specified positional arguments - for arg in bad_args: - if arg in current_args: - current_args.remove(arg) - - self.__clean_args = current_args - - return self.__clean_args - - def getRunTestsCommand(self, job, cpus): - """ return the command necessary to launch the TestHarness within the third party scheduler """ - - # Build ['/path/to/run_tests', '-j', '#'] - command = [os.path.join(self.harness.run_tests_dir, 'run_tests'), - '-j', str(cpus)] - - # get current sys.args we are allowed to include when we launch run_tests - args = list(self.cleanAndModifyArgs()) - - # Build [, '--spec-file' ,/path/to/tests', '-o', '/path/to', '--sep-files'] - args.extend(['--spec-file', - os.path.join(job.getTestDir(), - self.options.input_file_name), - '-o', 
job.getTestDir(), - '--sep-files']) - - # Build [, ] - command.extend(args) - - return command - - def hasQueuingFailed(self, job_data): - """ Determine if the third party scheduler killed the job prematurely """ - return False - - def _isProcessReady(self, job_data): - """ - Return bool on `run_tests --spec_file` submission results being available. Due to the - way the TestHarness writes to this results file (when the TestHarness exits), this file, - when available, means every test contained therein is finished in some form or another. - - If the result file does not exist, determine if it ever will exist. Tests which can fall - into this group, are those which were: skipped, deleted, silent, etc during the initial - launch phase. - """ - # No session file. Return immediately. - if not job_data.json_data.get(job_data.job_dir, False): - return False - - is_ready = True - # Job group exists in queue session and was apart of the queueing process - job_meta = job_data.json_data.get(job_data.job_dir, {}) - scheduler = job_data.json_data.get('SCHEDULER', '') - if job_meta: - # result file exists (jobs are finished) - if os.path.exists(os.path.join(job_data.job_dir, self.__job_storage_file)): - pass - - # ask derived scheduler if this job has failed - elif self.hasQueuingFailed(job_data): - for job in job_data.jobs.getJobs(): - job.setStatus(job.error) - is_ready = False - - # result does not yet exist but will in the future - else: - for job in job_data.jobs.getJobs(): - tester = job.getTester() - if tester.isSilent(): - continue - - status, message, caveats = job.previousTesterStatus(self.options, job_data.json_data) - if status == self.__status_system.skip or status == self.__status_system.no_status: - tester.setStatus(status, 'SKIP') - else: - tester.setStatus(status, job_meta[scheduler]['STATUS']) - if caveats: - tester.addCaveats(caveats) - status_message = tester.getStatusMessage() - - # This single job will enter the runner thread pool - if status_message == 
"LAUNCHING": - tester.setStatus(tester.queued) - - is_ready = False - - # Job group not originally launched - else: - for job in job_data.jobs.getJobs(): - tester = job.getTester() - status, message, caveats = job.previousTesterStatus(self.options, job_data.json_data) - tester.setStatus(status, message) - if caveats: - tester.addCaveats(caveats) - - if tester.isNoStatus(): - tester.setStatus(tester.silent) - is_ready = False - - if not is_ready: - for job in job_data.jobs.getJobs(): - job.setStatus(job.finished) - - return is_ready - - def _isLaunchable(self, job_data): - """ bool if jobs are ready to launch """ - # results data exists (set during scheduler plugin initialization), so do no launch again - if job_data.json_data.get(job_data.job_dir, False): - return False - - return True - - def _prepareJobs(self, job_data): - """ - Prepare jobs for launch. - - Grab an arbitrary job and record any necessary information the third party - queueing systems requires for launch (walltime, ncpus, etc). Set all other - jobs to a finished state. The arbitrary job selected will be the only job - which enters the runner thread pool, and executes the commands neccessary - for job submission. - """ - job_list = job_data.jobs.getJobs() - - # Clear any caveats set (except skips). 
As they do not apply during job submission - for job in [x for x in job_list if not x.isSkip()]: - job.clearCaveats() - - if job_list: - launchable_jobs = [x for x in job_list if not x.isFinished()] - if launchable_jobs: - executor_job = job_list.pop(job_list.index(launchable_jobs.pop(0))) - scheduler_meta = {job_data.plugin : {'QUEUEING_NCPUS' : self.getCores(job_data), - 'QUEUEING_MAXTIME' : self.getMaxTime(job_data)} - } - self.options.results_storage[executor_job.getTestDir()] = scheduler_meta - - executor_job.setStatus(executor_job.hold) - for job in launchable_jobs: - tester = job.getTester() - tester.setStatus(tester.queued, 'LAUNCHING') - job.setStatus(job.finished) - - def _prevJobGroupFinished(self, jobs): - """ Loop through jobs and return immediately if any one job has a finished status """ - for job in jobs: - # ignore detection of skipped/silent/deleted finished statuses. - if job.isSilent() or job.isSkip(): - continue - (key, value) = job.getTestDir(), job.getTestName() - previous_status = self.__status_system.createStatus(self.options.results_storage[key][value]['STATUS']) - if (self.__status_system.isValid(previous_status) - and previous_status not in self.__status_system.getPendingStatuses()): - return True - return False - - def _setJobStatus(self, job_data): - """ - Read the json results file for the finished submitted job group, and match our - job statuses with the results found therein. 
- """ - job_list = job_data.jobs.getJobs() - if job_list: - testdir_json = os.path.join(job_data.job_dir, self.__job_storage_file) - - with open(testdir_json, 'r') as f: - try: - # Determine if we've already recorded the results for this job group - if self._prevJobGroupFinished(job_list): - results = self.options.results_storage - else: - results = json.load(f) - except ValueError: - print('Unable to parse json file: %s' % (testdir_json)) - sys.exit(1) - - group_results = results[job_data.job_dir] - - # Continue to store previous third-party queueing data - job_meta = self.options.results_storage[job_data.job_dir] - job_list[0].addMetaData(**{job_data.plugin : job_meta[job_data.plugin]}) - job_meta[job_data.plugin]['STATUS'] = 'FINISHED' - - for job in job_list: - # Perhaps the user is filtering this job (--re, --failed-tests, etc) - tester = job.getTester() - job.setStatus(job.finished) - if tester.isSilent() or tester.isSkip(): - continue - elif self.options.failed_tests and tester.isPass(): - tester.setStatus(tester.silent) - continue - - if group_results.get(job.getTestName(), {}): - job_results = group_results[job.getTestName()] - status, message, caveats = job.previousTesterStatus(self.options, results) - tester.setStatus(status, message) - if caveats: - tester.addCaveats(caveats) - - # Recover useful job information from job results - job.setPreviousTime(job_results['TIMING']) - - # Read output file (--sep-files-ok|fail) - if job.getOutputFile() and os.path.exists(job.getOutputFile()): - self.addDirtyFiles(job, [job.getOutputFile()]) - if (self.options.reg_exp - or self.options.failed_tests - or self.options.verbose) and not self.options.quiet: - with open(job.getOutputFile(), 'r') as outfile: - job.setOutput(outfile.read()) - else: - job.setOutput(f'See error in file: {job.getOutputFile()}') - else: - job.setOutput('Output file is not available, or was never generated') - - # This is a newly added test in the spec file, which was not a part of original 
launch - else: - tester.addCaveats('not originally launched') - tester.setStatus(tester.skip) - - def _setSilentForClean(self, Jobs): - """ silence and set a finished status for all testers """ - job_list = Jobs.getJobs() - for job in job_list: - tester = job.getTester() - tester.setStatus(tester.silent) - job.setStatus(job.finished) - return job_list - - def _cleanupFiles(self, Jobs): - """ perform cleanup operations """ - job_list = self._setSilentForClean(Jobs) - top_job_key = job_list[0].getTestDir() - plugin = self.harness.scheduler.__class__.__name__ - - # Top Job (entire TestDir group) not originally part of what was launched - # (not launched due to: --re, -i --spec-file) - if top_job_key not in self.options.results_storage.keys(): - return - # All jobs ended up being skipped in this group - # (compiler!=gcc, heavy, petsc_version, etc) - elif plugin not in self.options.results_storage[top_job_key].keys(): - return - - # Build file_list with files we should delete - file_list = [os.path.join(top_job_key, self.options.results_file)] - job_meta = self.options.results_storage[top_job_key] - scheduler_meta = job_meta[plugin] - file_list.extend(scheduler_meta.get('DIRTY_FILES', [])) - for test_dir_key, meta in job_meta.items(): - if type(meta) == type({}) and 'META_DATA' in meta.keys(): - file_list.extend(meta["META_DATA"].get("DIRTY_FILES", [])) - - # Delete files generated by jobs and the scheduler - for dirty_file in file_list: - # Safty check. 
Any indigenous file generated by QueueManager should only exist in the tester directory - if os.path.dirname(dirty_file) == top_job_key: - try: - if os.path.isdir(dirty_file): - shutil.rmtree(dirty_file) - else: - os.remove(dirty_file) - except OSError: - pass diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 1aae116fc2c6..6f31361f684e 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -7,214 +7,340 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, re, json -from QueueManager import QueueManager -from TestHarness import util # to execute qsub -import math # to compute node requirement +import os, sys, re, json, socket, datetime, threading +from RunParallel import RunParallel +from TestHarness.runners.PBSRunner import PBSRunner +from timeit import default_timer as clock from PBScodes import * +import paramiko + +import jinja2 +from jinja2 import meta ## This Class is responsible for maintaining an interface to the PBS scheduling syntax -class RunPBS(QueueManager): +class RunPBS(RunParallel): @staticmethod def validParams(): - params = QueueManager.validParams() + params = RunParallel.validParams() params.addParam('queue_template', os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template'), "Location of the PBS template") return params + class PBSJob: + """ + Structure that represents the cached information about a PBS job + """ + def __init__(self, id, command): + # The PBS job identifier + self.id = id + # Whether or not this job is done; here done doesn't mean if it + # was successful or not, just if it is not running/queued anymore + self.done = False + # The exit code of the command that was ran (if any) + self.exit_code = None + # The job state as defined by PBS + self.state = None + # The command that was ran within the qsub script + self.command = command + # Whether or 
not this job was killed; used so what we don't + # bother killing a job multiple times + self.killed = False + def __init__(self, harness, params): - QueueManager.__init__(self, harness, params) + RunParallel.__init__(self, harness, params) self.params = params - self.harness = harness - self.options = self.harness.getOptions() - - def getBadKeyArgs(self): - """ arguments we need to remove from sys.argv """ - return ['--pbs'] - - def _readJobOutput(self, output_file, N=5): - """ return last few lines in output_file for job group """ - output = [] - if os.path.exists(output_file): - with open(output_file, 'r') as outfile: - for line in (outfile.readlines() [-N:]): - output.append(line) - output.append(f'Last {N} lines read. Full output file available at:\n{output_file}') - return '\n'.join(output) - - def hasQueuingFailed(self, job_data): - """ Determine if PBS killed the job prematurely """ - queue_plugin = self.__class__.__name__ - jobs = job_data.jobs.getJobs() - meta_data = job_data.json_data.get(jobs[0].getTestDir()) - launch_id = meta_data.get(queue_plugin, {}).get('ID', '').split('.')[0] - output_file = os.path.join(jobs[0].getTestDir(), 'qsub.output') - - # Job was never originally launched - if not meta_data.get(queue_plugin, False) or not launch_id: - return False - - # Job ran to completion - elif os.path.exists(os.path.join(jobs[0].getTestDir(), '.previous_test_results.json')): - return False - - ### Job has some other status ### - - # Check qstat for current status - qstat_command_result = util.runCommand(f'qstat -xf -F json {launch_id}') - - # Catch json parsing errors - try: - json_out = json.loads(qstat_command_result) - pbs_server = json_out['pbs_server'] - job_meta = json_out['Jobs'][f'{launch_id}.{pbs_server}'] - - # JobID no longer exists (stale after 1 week) - except json.decoder.JSONDecodeError: - # Job did not run to completion (no .previous_test_results.json file exists) - if os.path.exists(output_file): - qstat_command_result = (f'ERROR: 
{self._readJobOutput(output_file)}' - '\n\nMore information available in\n' - f' {output_file}\n') - - # Failed parse, and no output file. Perhaps the PBS job was canceled, deleted, etc - else: - qstat_command_result = ('ERROR: TestHarness encountered an error while' - f'determining what to make of PBS JobID {launch_id}:\n' - f'{qstat_command_result}') - - # Handle a qstat execution failure - if qstat_command_result.find('ERROR') != -1: - for job in job_data.jobs.getJobs(): - job.setOutput(f'ERROR invoking `qstat`:\n{qstat_command_result}') - job.setStatus(job.error, 'QSTAT') - return True - - # Use qstat json output to examine current status - job_result = job_meta.get('Exit_status', False) - - # Set the status as seen by qstat - meta_data[self.__class__.__name__]['STATUS'] = PBS_STATUSES[job_meta['job_state']] - - # Woops. This job was killed by PBS for some reason - if job_result and str(job_result) in PBS_User_EXITCODES.keys(): - output = f'{self._readJobOutput(output_file)}\n{PBS_User_EXITCODES[str(job_result)]}' - for job in jobs: - job.setOutput(output) - job.addCaveats(f'PBS ERROR: {job_result}') - return True - - # Capture TestHarness exceptions - elif job_result and job_result != "0": - if os.path.exists(output_file): - with open(output_file, 'r') as f: - output_string = util.readOutput(f, None, jobs[0].getTester()) - jobs[0].setOutput(output_string) - # Add a caveat to each job, explaining that one of the jobs caused a TestHarness exception - for job in jobs: - job.addCaveats('TESTHARNESS EXCEPTION') - return True - - return False - - def _augmentTemplate(self, job): - """ populate qsub script template with paramaters """ - job_data = self.options.results_storage.get(job.getTestDir(), {}) - queue_meta = job_data.get(self.__class__.__name__, { self.__class__.__name__: {} }) - - template = {} - - # Launch script location - template['launch_script'] = os.path.join(job.getTestDir(), os.path.basename(job.getTestNameShort()) + '.qsub') - - # NCPUS - 
template['mpi_procs'] = queue_meta.get('QUEUEING_NCPUS', 1) - - # Compute node requirement - if self.options.pbs_node_cpus and template['mpi_procs'] > self.options.pbs_node_cpus: - nodes = template['mpi_procs']/self.options.pbs_node_cpus - template['mpi_procs'] = self.options.pbs_node_cpus - else: - nodes = 1 - template['nodes'] = math.ceil(nodes) - - # Convert MAX_TIME to hours:minutes for walltime use - max_time = queue_meta.get('QUEUEING_MAXTIME', 1) - hours = int(int(max_time) / 3600) - minutes = int(int(max_time) / 60) % 60 - template['walltime'] = '{0:02d}'.format(hours) + ':' + '{0:02d}'.format(minutes) + ':00' - - # Job Name - template['job_name'] = os.path.basename(job.getTestNameShort()) - - # PBS Project group - template['pbs_project'] = '#PBS -P %s' % (self.options.queue_project) - - # PBS Queue - if self.options.queue_queue: - template['pbs_queue'] = '#PBS -q %s' % (self.options.queue_queue) - else: - template['pbs_queue'] = '' - - # Apply source command - if self.options.queue_source_command and os.path.exists(self.options.queue_source_command): - template['pre_command'] = 'source %s || exit 1' % (os.path.abspath(self.options.queue_source_command)) - else: - template['pre_command'] = '' - - # Redirect stdout to this location - template['output'] = os.path.join(job.getTestDir(), 'qsub.output') - - # Root directory - template['working_dir'] = self.harness.base_dir - - # Command - template['command'] = ' '.join(self.getRunTestsCommand(job, template['mpi_procs'])) - - return template - - def run(self, job): - """ execute qsub and return the launch id """ + self.options = harness.getOptions() + + # We don't want to report long running jobs here because we will + # manually set jobs as RUNNING as we notice their PBS status change + self.report_long_jobs = False + # We don't want to enforce the timeout here because we don't want to + # check it while the jobs are queued and PBS itself will handle the + # timeout because the job itself will be forcefully 
killed by PBS + self.enforce_timeout = False + + # Lock for accessing self.pbs_jobs + self.pbs_jobs_lock = threading.Lock() + # The last time statues were updated in getPBSJob() (if any) + self.pbs_jobs_status_timer = None + # How often to poll for status updates in getPBSJob() + self.pbs_jobs_update_interval = 10 + # Map of Job -> PBSJob + self.pbs_jobs = {} + + # The jump hostname for running PBS commands, if any + self.pbs_ssh_host = self.options.queue_host + # Setup the remote PBS host, if any (needed when submitted in a container) + self.pbs_ssh = None + # The lock for calling PBS commands via SSH, if any + self.pbs_ssh_lock = None + # Setup the jump host if provided + if self.pbs_ssh_host: + self.pbs_ssh_lock = threading.Lock() + self.pbs_ssh = paramiko.SSHClient() + self.pbs_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.pbs_ssh.connect(self.pbs_ssh_host) + + # Load the PBS template + template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template') + self.default_template = open(template_path, 'r').read() + + if os.environ.get('APPTAINER_CONTAINER'): + if not self.pbs_ssh_host: + print('ERROR: --pbs-host must be set when using --pbs within apptainer') + sys.exit(1) + if not self.options.queue_source_command: + default_pre_source = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_source_apptainer') + self.options.queue_source_command = default_pre_source + print(f'INFO: Setting --pbs-pre-source={default_pre_source}') + if self.options.queue_source_command and not os.path.exists(self.options.queue_source_command): + print(f'ERROR: --pbs-pre-source path {self.options.queue_source_command} does not exist') + sys.exit(1) + + class CallPBSException(Exception): + """Exception class for providing extra context for PBS submission errors""" + def __init__(self, run_pbs, description, command, result=None): + message = f'{description}' + if run_pbs.pbs_ssh: + message += f' on host "{run_pbs.pbs_ssh_host}"' + message 
+= f'\nCommand: {command}' + if result: + message += f'\n\nResult:\n{result}' + super().__init__(message) + + def callPBS(self, command): + """Wrapper for calling a PBS command (qsub, qstat, etc) that supports + SSH-ing to another host as needed when calling from within apptainer""" + if not self.pbs_ssh: + raise Exception('PBS not currently supported outside of a container') + + with self.pbs_ssh_lock: + try: + _, stdout, stderr = self.pbs_ssh.exec_command(command) + exit_code = stdout.channel.recv_exit_status() + result = ''.join(stdout.readlines()) + if exit_code != 0: + result += ''.join(stderr.readlines()) + except Exception as e: + raise RunPBS.CallPBSException(self, 'Failed to execute remote PBS command', command) from e + return exit_code, result.rstrip() + + def availableSlots(self, params): + return 250, False + + def getPBSJobName(self, job): + """Gets the name of the PBS job given a tester + + PBS doesn't like ":" or "/", hence changing them to "." + """ + return job.getTestName().replace(':', '.').replace('/', '.') + + def getPBSJobOutputPathPrefix(self, job): + """Gets the absolute path prefix for a PBS job""" + return os.path.join(job.getTestDir(), "pbs_" + job.getTestNameShort().replace('/', '.')) + + def getPBSJobOutputPath(self, job): + """Gets the absolute path for stdout/stderr for a PBS job""" + return self.getPBSJobOutputPathPrefix(job) + '.out' + + def getPBSJobSubmissionPath(self, job): + """Gets the aboslute path for the qsub script for a PBS job""" + return self.getPBSJobOutputPathPrefix(job) + '.qsub' + + def submitJob(self, job): + """Submits a PBS job""" tester = job.getTester() - if self.options.dry_run: - tester.setStatus(tester.success, 'DRY_RUN') - return - - template = self._augmentTemplate(job) - job_meta = self.options.results_storage.get(job.getTestDir(), { job.getTestDir() : {} }) - self.createQueueScript(job, template) - command = ' '.join(['qsub', template['launch_script']]) - launch_results = util.runCommand(command, 
job.getTestDir()) - - # List of files we need to clean up when we are done - dirty_files = [template['launch_script'], - template['output'], - os.path.join(job.getTestDir(), self.harness.results_file)] - - if launch_results.find('ERROR') != -1: - # The executor job failed (so fail all jobs in this group) - job_dag = job.getDAG() - - for other_job in [x for x in job_dag.topological_sort() if x != job]: - other_job.clearCaveats() - other_tester = other_job.getTester() - other_tester.setStatus(other_tester.fail, 'launch failure') - - # This is _only_ to make the failed message more useful - tester.specs['command'] = command - tester.setStatus(tester.fail, 'QSUB Group Failure') - job.setOutput(launch_results) - - else: - # While RunPBS believes this was a successful launch, perhaps this system's PBS system - # failed to launch for some other strange reason, and didn't error (above .find(ERROR) - # routine). In which case, it helps to set some 'past tense' grammar as our result - # in our '--pbs some_name' json file - job_meta[self.__class__.__name__].update({'ID' : launch_results, - 'QSUB_COMMAND' : command, - 'NCPUS' : template['mpi_procs'], - 'WALLTIME' : template['walltime'], - 'QSUB_OUTPUT' : template['output'], - 'STATUS' : 'PREVIOUSLY LAUNCHED', - 'DIRTY_FILES' : dirty_files}) - - tester.setStatus(tester.queued, 'LAUNCHING') + options = self.options + + # The qsub script we're going to write to + qsub_file = self.getPBSJobSubmissionPath(job) + # The combined stdout+stderr from the PBS job + output_file = self.getPBSJobOutputPath(job) + # Clean these two files + for file in [qsub_file, output_file]: + if os.path.exists(file): + os.remove(file) + + # Set up the command. 
We have special logic here for when we're using apptainer, + # where we need to put the MPI command outside of the apptainer call + command, mpi_command = tester.getCommand(options) + full_command = '' + if mpi_command: + full_command += f'{mpi_command} ' + APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') + if APPTAINER_CONTAINER: + apptainer_cmd = f'apptainer exec {APPTAINER_CONTAINER}' + full_command += f'{apptainer_cmd} ' + # The double quotes around the exec command here are important as apptainer exec + # doesn't work well with our command line arguments for some reason + full_command += f'"{command}"' + + num_procs = tester.getProcs(options) + num_threads = tester.getThreads(options) + walltime = str(datetime.timedelta(seconds=tester.getMaxTime())) + + # Set up the template + template_env = {'NAME': self.getPBSJobName(job), + 'SELECT': f'{num_procs}:mpiprocs=1:ncpus={num_threads}', + 'WALLTIME': walltime, + 'PROJECT': self.options.queue_project, + 'OUTPUT': output_file, + 'PLACE': 'scatter', + 'SUBMITTED_HOSTNAME': socket.gethostname(), + 'CWD': tester.getTestDir(), + 'COMMAND': full_command, + 'ENDING_COMMENT': self.getOutputEndingComment()} + if self.options.queue_queue: + template_env['QUEUE'] = self.options.queue_queue + if self.options.queue_source_command: + template_env['SOURCE_COMMAND'] = self.options.queue_source_command + + # Build the script + jinja_env = jinja2.Environment() + definition_template = jinja_env.from_string(self.default_template) + jinja_env.trim_blocks = True + jinja_env.lstrip_blocks = True + script = definition_template.render(**template_env) + + # Write the script + open(qsub_file, 'w').write(script) + + # qsub submission command + qsub_command = [f'cd {tester.getTestDir()}'] + qsub_command += [f'qsub {qsub_file}'] + qsub_command = '; '.join(qsub_command) + + # Set what we've ran for this job so that we can + # potentially get the context in an error + command_ran = qsub_command + if self.pbs_ssh: + command_ran = f"ssh 
{self.pbs_ssh_host} '{qsub_command}'" + job.getTester().setCommandRan(command_ran) + + # Do the submission; this is thread safe + # Eventually we might want to make this a pool so we can submit multiple + # jobs at the same time + exit_code, result = self.callPBS(qsub_command) + + # Nonzero return code + if exit_code != 0: + raise self.CallPBSException(self, 'qsub failed', qsub_command, result) + + # Make sure the job ID is something we'd expect + job_id = result + search = re.search('^[0-9]+.[a-zA-Z0-9_-]+$', job_id) + if not search: + raise self.CallPBSException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) + + # Job has been submitted, so set it as queued + job.addCaveats(job_id) + self.setAndOutputJobStatus(job, job.queued) + + # Setup the job in the status map + with self.pbs_jobs_lock: + if job in self.pbs_jobs: + raise Exception('Job has already been submitted') + self.pbs_jobs[job] = self.PBSJob(job_id, full_command) + + def killJob(self, job): + """Kills a PBS job""" + with self.pbs_jobs_lock: + if job not in self.pbs_jobs: + return + pbs_job = self.pbs_jobs[job] + if pbs_job.done or pbs_job.killed: + return + job_id = self.pbs_jobs[job].id + + # Don't care about whether or not this failed + self.callPBS(f'qdel {job_id}') + + def killRemaining(self, keyboard=False): + """Kills all currently running PBS jobs""" + job_ids = [] + with self.pbs_jobs_lock: + for pbs_job in self.pbs_jobs.values(): + if not pbs_job.done: + job_ids.append(pbs_job.id) + + # Don't care about whether or not this failed + self.callPBS(f'qdel {" ".join(job_ids)}') + + with self.pbs_jobs_lock: + for pbs_job in self.pbs_jobs.values(): + if not pbs_job.done: + pbs_job.killed = True + + RunParallel.killRemaining(self, keyboard) + + def buildRunner(self, job, options): + return PBSRunner(job, options, self) + + def getOutputEndingComment(self): + """Gets the text we append to the PBS stderr+stdout file to desginate + that it is complete""" + return 'Completed TestHarness RunPBS 
job' + + def getPBSJob(self, job): + """Gets the PBSJob object for a given Job + + This will periodically update the PBSJob in a thread safe manner so + that we are not constantly calling qstat for every call.""" + + with self.pbs_jobs_lock: + # If this is the first time seeing this job, initialize it in the list + if job not in self.pbs_jobs: + raise Exception('Failed to get status for unsubmitted job') + + # Only update the statues periodically as this is called across threads + if self.pbs_jobs_status_timer is None or ((clock() - self.pbs_jobs_status_timer) > self.pbs_jobs_update_interval): + # Obtain the IDs of jobs that are active that we need to poll for + active_job_ids = [] + for job, pbs_job in self.pbs_jobs.items(): + if not pbs_job.done: + active_job_ids.append(pbs_job.id) + + # Poll for all of the jobs within a single call + cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids + exit_code, result = self.callPBS(' '.join(cmd)) + if exit_code != 0: + raise self.CallPBSException(self, 'Failed to get job status', cmd, result) + + # Register that we've updated the status + self.pbs_jobs_status_timer = clock() + + # Attempt to parse the status from the jobs + try: + json_result = json.loads(result) + job_results = json_result['Jobs'] + + for job, pbs_job in self.pbs_jobs.items(): + # We're only updating jobs that aren't done yet + if pbs_job.done: + continue + + # This job's result from the qstat command + job_result = job_results[pbs_job.id] + exit_code = job_result.get('Exit_status') + if exit_code is not None: + exit_code = int(exit_code) + state = job_result.get('job_state') + substate = job_result.get('substate') + terminated = int(substate) == 91 if substate else False + done = exit_code is not None or terminated + + # Get the job state, and report running if it switched to running + if state == 'R' and pbs_job.state != 'R': + self.setAndOutputJobStatus(job, job.running) + + # Update the PBSJob structure + pbs_job.done = done + pbs_job.exit_code = 
exit_code + pbs_job.state = state + + # Mark the job as terminated (past walltime, over resources, killed) + if terminated: + job.setStatus(job.error, 'PBS JOB TERMINATED') + except Exception as e: + raise self.CallPBSException(self, f'Failed to parse collective job status', cmd, result) from e + + return self.pbs_jobs[job] diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index d59b80a34867..b1f75206692b 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -32,7 +32,10 @@ def run(self, job): """ Run a tester command """ tester = job.getTester() - tester.setRunner(self.buildRunner(tester)) + + # Build and set the runner that will actually run the commands + # This is abstracted away so we can support local runners and PBS/slurm runners + tester.setRunner(self.buildRunner(job, self.options)) # Do not execute app, and do not processResults if self.options.dry_run: @@ -83,7 +86,7 @@ def run(self, job): output += '\n' + "#"*80 + '\nTester failed, reason: ' + tester.getStatusMessage() + '\n' self.setSuccessfulMessage(tester) - except Exception as e: + except Exception: output += 'Python exception encountered:\n\n' + traceback.format_exc() tester.setStatus(StatusSystem().error, 'TESTER EXCEPTION') @@ -96,13 +99,13 @@ def run(self, job): # Set testers output with modifications made above so it prints the way we want it job.setOutput(output) - def buildRunner(self, tester: Tester) -> Runner: + def buildRunner(self, job, options) -> Runner: """Builds the runner for a given tester This exists as a method so that derived schedulers can change how they run commands (i.e., for PBS and slurm) """ - return SubprocessRunner(tester) + return SubprocessRunner(job, options) def setSuccessfulMessage(self, tester): """ properly set a finished successful message for tester """ diff --git a/python/TestHarness/schedulers/Scheduler.py 
b/python/TestHarness/schedulers/Scheduler.py index 129ea518edeb..b577137541e6 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -59,12 +59,7 @@ def __init__(self, harness, params): # The Scheduler class can be initialized with no "max_processes" argument and it'll default # to a soft limit. If however a max_processes is passed we'll treat it as a hard limit. # The difference is whether or not we allow single jobs to exceed the number of slots. - if params['max_processes'] == None: - self.available_slots = 1 - self.soft_limit = True - else: - self.available_slots = params['max_processes'] # hard limit - self.soft_limit = False + self.available_slots, self.soft_limit = self.availableSlots(params) self.average_load = params['average_load'] @@ -114,6 +109,29 @@ def __init__(self, harness, params): # to be scheduled. KeyboardInterrupts are then handled by the thread pools. self.__waiting = False + # Whether or not to report long running jobs as RUNNING + self.report_long_jobs = True + # Whether or not to enforce the timeout of jobs + self.enforce_timeout = True + + # The job lock + self.j_lock = None + + def availableSlots(self, params): + """ + Get the number of available slots for processing jobs and + whether or not that limit is a soft or hard limit. + + Needed so that derived schedulers can modify this limit. 
+ """ + if params['max_processes'] == None: + available_slots = 1 + soft_limit = True + else: + available_slots = params['max_processes'] # hard limit + soft_limit = False + return available_slots, soft_limit + def triggerErrorState(self): self.__error_state = True self.run_pool.close() @@ -173,6 +191,14 @@ def __sortAndLaunch(self): for (jobs, j_dag, j_lock) in sorted_jobs: self.queueJobs(jobs, j_lock) + def setAndOutputJobStatus(self, job, status): + """ + Sets a Job's status and forces the status to be output asap + """ + job.setStatus(status) + job.force_report_status = True + self.status_pool.apply_async(self.jobStatus, (job, None, self.j_lock)) + def waitFinish(self): """ Inform the Scheduler to begin running. Block until all jobs finish. @@ -221,7 +247,7 @@ def schedule(self, testers): # Instance our job DAG, create jobs, and a private lock for this group of jobs (testers) jobs = JobDAG(self.options) j_dag = jobs.createJobs(testers) - j_lock = threading.Lock() + self.j_lock = threading.Lock() # Allow derived schedulers access to the jobs before they launch self.augmentJobs(jobs) @@ -230,11 +256,11 @@ def schedule(self, testers): if j_dag.size() != len(testers): raise SchedulerError('Scheduler was going to run a different amount of testers than what was received (something bad happened)!') - with j_lock: + with self.j_lock: # As testers (jobs) finish, they are removed from job_bank self.__job_bank.update(j_dag.topological_sort()) # List of objects relating to eachother (used for thread locking this job group) - self.__dag_bank.append([jobs, j_dag, j_lock]) + self.__dag_bank.append([jobs, j_dag, self.j_lock]) # Store all scheduled jobs self.__scheduled_jobs.append(j_dag.topological_sort()) @@ -336,13 +362,19 @@ def jobStatus(self, job, jobs, j_lock): # Peform within a try, to allow keyboard ctrl-c try: with j_lock: - if job.isRunning(): - # already reported this job once before + # This job is set to force a status + force_status = job.force_report_status + + 
if force_status: + with self.activity_lock: + self.jobs_reported.add(job) + job.force_report_status = False + elif job.isRunning(): if job in self.jobs_reported: return # this job will be reported as 'RUNNING' - elif clock() - self.last_reported_time >= self.min_report_time: + if clock() - self.last_reported_time >= self.min_report_time: # prevent 'finished' caveat with options expecting to take lengthy amounts of time if (not self.options.sep_files and not self.options.ok_files @@ -354,7 +386,6 @@ def jobStatus(self, job, jobs, j_lock): with self.activity_lock: self.jobs_reported.add(job) - # TestHarness has not yet been inactive long enough to warrant a report else: # adjust the next report time based on delta of last report time @@ -408,16 +439,21 @@ def runJob(self, job, jobs, j_lock): with self.activity_lock: self.__active_jobs.add(job) - timeout_timer = threading.Timer(float(job.getMaxTime()), - self.handleTimeoutJob, - (job, j_lock,)) - - job.report_timer = threading.Timer(self.min_report_time, - self.handleLongRunningJob, - (job, jobs, j_lock,)) - - job.report_timer.start() - timeout_timer.start() + if self.enforce_timeout: + timeout_timer = threading.Timer(float(job.getMaxTime()), + self.handleTimeoutJob, + (job, j_lock,)) + timeout_timer.start() + else: + timeout_timer = None + + if self.report_long_jobs: + job.report_timer = threading.Timer(self.min_report_time, + self.handleLongRunningJob, + (job, jobs, j_lock,)) + job.report_timer.start() + else: + job.report_timer = None # We have a try here because we want to explicitly catch things like # python errors in _only_ the Job; exceptions that happen in the Tester @@ -428,14 +464,17 @@ def runJob(self, job, jobs, j_lock): with j_lock: job.setStatus(StatusSystem().error, 'JOB EXCEPTION') job.setOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) - timeout_timer.cancel() + + if timeout_timer: + timeout_timer.cancel() # Recover worker count before attempting to queue more jobs 
with self.slot_lock: self.slots_in_use = max(0, self.slots_in_use - job.getSlots()) # Stop the long running timer - job.report_timer.cancel() + if job.report_timer: + job.report_timer.cancel() # All done with j_lock: diff --git a/python/TestHarness/schedulers/pbs_source_apptainer b/python/TestHarness/schedulers/pbs_source_apptainer new file mode 100644 index 000000000000..30c6242e3d10 --- /dev/null +++ b/python/TestHarness/schedulers/pbs_source_apptainer @@ -0,0 +1 @@ +module load use.moose moose-dev-container diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index f3753c7ad741..98126ec1a151 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -1,16 +1,45 @@ #!/bin/bash -#PBS -N -#PBS -l select=:ncpus= -#PBS -l walltime= - - +#PBS -N {{ NAME }} +#PBS -l select={{ SELECT }} +#PBS -l walltime={{ WALLTIME }} +#PBS -P {{ PROJECT }} +{%- if QUEUE is defined %} +#PBS -q {{ QUEUE }} +{%- endif %} #PBS -j oe -#PBS -o -#PBS -l place=free - -JOB_NUM=${PBS_JOBID%\.*} +#PBS -o {{ OUTPUT }} +#PBS -l place={{ PLACE }} -export MV2_ENABLE_AFFINITY=0 +{%- if SOURCE_COMMAND is defined %} +# Set by the --pbs-pre-source TestHarness option +source {{ SOURCE_COMMAND }} || exit $? +{%- endif %} + +# Print a useful header +echo "################################################################################" +echo "Beginning TestHarness RunPBS job" +echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" +echo "Hostname: $(hostname)" +echo "Time: $(date)" +echo "Directory: {{ CWD }}" +echo "Output: {{ OUTPUT }}" +echo "Command: {{ COMMAND }}" +module list +echo "################################################################################" + +# Move into the test directory +cd {{ CWD }} + +# Run the command +{{ COMMAND }} +# ...and capture the return code cause we're not done yet +return_code=$? + +# Append a recognizable string at the end of the output. 
We look +# for this string when parsing the output so that we can be sure +# that we have obtained all of the output +echo "{{ ENDING_COMMENT }}" + +# Exit with the real return code from the job that we ran +exit $return_code -cd - diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 70ef3b9cf2a3..50942864ccfa 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -28,7 +28,7 @@ def validParams(): def __init__(self, name, params): FileTester.__init__(self, name, params) - def getOutputFiles(self): + def getOutputFiles(self, options): # analizejacobian.py outputs files prefixed with the input file name return [self.specs['input']] @@ -68,7 +68,7 @@ def getCommand(self, options): if len(specs['cli_args']): command += '--cli-args "' + (' '.join(specs['cli_args']) + '"') - return command + return command, None def processResults(self, moose_dir, options, output): @@ -88,3 +88,12 @@ def processResults(self, moose_dir, options, output): self.setStatus(self.fail, reason) return output + + def checkRunnable(self, options): + # We cannot rely on an external script running things with PBS + if options.pbs: + self.addCaveats('PBS NOT SUPPORTED') + self.setStatus(self.skip) + return False + + return FileTester.checkRunnable(self, options) diff --git a/python/TestHarness/testers/CSVDiff.py b/python/TestHarness/testers/CSVDiff.py index 48141a83e55d..7d1dfbce2db5 100644 --- a/python/TestHarness/testers/CSVDiff.py +++ b/python/TestHarness/testers/CSVDiff.py @@ -29,7 +29,7 @@ def validParams(): def __init__(self, name, params): FileTester.__init__(self, name, params) - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['csvdiff'] # Check that override parameter lists are the same length diff --git a/python/TestHarness/testers/CheckFiles.py b/python/TestHarness/testers/CheckFiles.py index 78d0a04306c3..6cbae7e2f763 100644 --- 
a/python/TestHarness/testers/CheckFiles.py +++ b/python/TestHarness/testers/CheckFiles.py @@ -28,7 +28,7 @@ def __init__(self, name, params): if not (params.isValid('check_files') or params.isValid('check_not_exists')): raise Exception('Either "check_files" or "check_not_exists" must be supplied for a CheckFiles test') - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['check_files'] + self.specs['check_not_exists'] def processResults(self, moose_dir, options, output): @@ -75,3 +75,12 @@ def processResults(self, moose_dir, options, output): self.setStatus(self.fail, reason) return output + + def checkRunnable(self, options): + # We cannot reliably check if files do not exist with a networked file system + if options.pbs and self.specs['check_not_exists']: + self.addCaveats('PBS NOT SUPPORTED') + self.setStatus(self.skip) + return False + + return FileTester.checkRunnable(self, options) diff --git a/python/TestHarness/testers/Exodiff.py b/python/TestHarness/testers/Exodiff.py index 8668756ee10d..80b9a8c20413 100644 --- a/python/TestHarness/testers/Exodiff.py +++ b/python/TestHarness/testers/Exodiff.py @@ -41,7 +41,7 @@ def __init__(self, name, params): if self.specs['map'] and self.specs['partial']: raise Exception("For the Exodiff tester, you cannot specify both 'map' and 'partial' as True") - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['exodiff'] def processResultsCommand(self, moose_dir, options): diff --git a/python/TestHarness/testers/FileTester.py b/python/TestHarness/testers/FileTester.py index 73e1eb5eb4d8..c2ebc3176378 100644 --- a/python/TestHarness/testers/FileTester.py +++ b/python/TestHarness/testers/FileTester.py @@ -27,4 +27,4 @@ def __init__(self, name, params): def prepare(self, options): if self.specs['delete_output_before_running']: - util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles()) + util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles(options)) 
diff --git a/python/TestHarness/testers/ImageDiff.py b/python/TestHarness/testers/ImageDiff.py index 379266ec95bc..2e3ce0a2d958 100644 --- a/python/TestHarness/testers/ImageDiff.py +++ b/python/TestHarness/testers/ImageDiff.py @@ -33,7 +33,7 @@ def __init__(self, name, params): elif 'skimage' not in self.specs['required_python_packages']: self.specs['required_python_packages'] += ' skimage' - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['imagediff'] def processResults(self, moose_dir, options, output): diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 1a77cd6e9254..4ff1f301861c 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -46,4 +46,5 @@ def getCommand(self, options): else: cmd = "python3 -m unittest" + use_buffer + "-v " + test_case - return cmd + ' '.join(self.specs['cli_args']) + # We need to append PYTHONPATH here for running these within apptainer + return f'PYTHONPATH={self.getMooseDir()}/python ' + cmd + ' '.join(self.specs['cli_args']), None diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index ceff7a48361f..80074d2f0820 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -225,15 +225,16 @@ def getCommand(self, options): elif nthreads > 1: command = command + ' --n-threads=' + str(nthreads) + mpi_command = None if self.force_mpi or options.parallel or ncpus > 1: - command = f'{self.mpi_command} -n {ncpus} {command}' + # Arbitrary proxy command, but keep track of the command so that someone could use it later + if specs.isValid('command_proxy'): + raise Exception('no worky yet') + # command = command.replace('"', r'\"') + # command = f'RUNAPP_COMMAND="{command}" {os.path.join(specs["test_dir"], specs["command_proxy"])}' + mpi_command = f'{self.mpi_command} -n {ncpus}' - # Arbitrary proxy command, but keep track of 
the command so that someone could use it later - if specs.isValid('command_proxy'): - command = command.replace('"', r'\"') - return f'RUNAPP_COMMAND="{command}" {os.path.join(specs["test_dir"], specs["command_proxy"])}' - - return command + return command, mpi_command def testFileOutput(self, moose_dir, options, output): """ Set a failure status for expressions found in output """ @@ -348,3 +349,8 @@ def processResults(self, moose_dir, options, output): output += self.testExitCodes(moose_dir, options, output) return output + + def mustOutputExist(self): + if self.specs['should_crash']: + return self.getExitCode() != 0 + return self.getExitCode() == 0 diff --git a/python/TestHarness/testers/RunCommand.py b/python/TestHarness/testers/RunCommand.py index ff5e888c18b2..a380fa39bb14 100644 --- a/python/TestHarness/testers/RunCommand.py +++ b/python/TestHarness/testers/RunCommand.py @@ -24,7 +24,7 @@ def __init__(self, name, params): def getCommand(self, options): # Create the command line string to run - return self.command + return self.command, None def processResults(self, moose_dir, options, output): if self.getExitCode() == 77 : diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index bf6effe23c24..5d9a5790dbc3 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -38,11 +38,14 @@ def checkRunnable(self, options): return RunApp.checkRunnable(self, options) def prepare(self, options): - if self.getProcs(options) > 1: - file_paths = [] - for processor_id in range(self.getProcs(options)): - file_paths.append(self.name() + '.processor.{}'.format(processor_id)) - util.deleteFilesAndFolders(self.getTestDir(), file_paths, False) + if self.hasRedirectedOutput(options): + files = self.getRedirectedOutputFiles(options) + util.deleteFilesAndFolders(self.getTestDir(), files, False) + + def getOutputFiles(self, options): + if self.hasRedirectedOutput(options): + return 
self.getRedirectedOutputFiles(options) + return [] def processResults(self, moose_dir, options, output): # Exceptions are written to stderr, which can be interleaved so we normally redirect these diff --git a/python/TestHarness/testers/SignalTester.py b/python/TestHarness/testers/SignalTester.py index 1852308dbc7c..e2fa81b469c3 100644 --- a/python/TestHarness/testers/SignalTester.py +++ b/python/TestHarness/testers/SignalTester.py @@ -38,43 +38,13 @@ def __init__(self, name, params): f"a supported signal type. Currently supported signal types are:\n{', '.join(list(valid_signals.keys()))}") raise e - def send_signal(self): - """Function used to send a signal to the program automatically for testing purposes.""" + def checkRunnable(self, options): + if options.pbs: + self.addCaveats('PBS NOT SUPPORTED') + self.setStatus(self.skip) + return False - # Create a while loop that checks if the stdout buffer has any data in it, and then sends the signal once - # it knows that the moose_test binary is actually doing something. + return super().checkRunnable(self, options) - # process.poll() returns the process's exit code if it has completed, and None if it is still running. - # This acts as a safety precaution against an infinite loop -- this will always close. - while self.process.poll() is None: - - # tell() gives the current position in the file. If it is greater than zero, the binary - # has started running and writing output. - # if the output is blank, the moose_test binary hasn't actually started doing anything yet. - # if so, sleep briefly and check again. - if not self.outfile.tell(): - time.sleep(0.05) - - # if the output isn't blank, then we finally send the signal and exit the loop - else: - try: - os.kill(self.process.pid, self.signal) - break - except ProcessLookupError as e: - print("Unable to send signal to process. 
Has it already terminated?") - raise e - - def runCommand(self, timer, options): - """ - Helper method for running external (sub)processes as part of the tester's execution. This - uses the tester's getCommand and getTestDir methods to run a subprocess. The timer must - be the same timer passed to the run method. Results from running the subprocess is stored - in the tester's output and exit_code fields. - """ - - exit_code = super().spawnProcessFromOptions(timer, options) - if exit_code: # Something went wrong - return - - self.send_signal() - super().finishAndCleanupSubprocess(timer) + def postSpawn(self): + self._runner.sendSignal(self.signal) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 9fc733bd4ba5..63e77f3891e3 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -7,16 +7,14 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import platform, re, os, sys, pkgutil, shutil, shlex +import re, os, sys, shutil import mooseutils from TestHarness import util from TestHarness.StatusSystem import StatusSystem from TestHarness.runners.Runner import Runner from FactorySystem.MooseObject import MooseObject -from tempfile import SpooledTemporaryFile, TemporaryDirectory +from tempfile import TemporaryDirectory from pathlib import Path -import subprocess -from signal import SIGTERM class Tester(MooseObject): """ @@ -167,6 +165,10 @@ def __init__(self, name, params): # The object that'll actually do the run self._runner = None + # The command that we actually ended up running; this may change + # depending on the runner which might inject something + self.command_ran = None + def getTempDirectory(self): """ Gets a shared temp directory that will be cleaned up for this Tester @@ -254,6 +256,10 @@ def getTestName(self): """ return test name """ return self.specs['test_name'] + def getTestNameShort(self): + """ return test short 
name (not including the path) """ + return self.specs['test_name_short'] + def getPrereqs(self): """ return list of prerequisite tests this test depends on """ return self.specs['prereq'] @@ -298,7 +304,7 @@ def getInputFileContents(self): """ return the contents of the input file applicable to this Tester """ return None - def getOutputFiles(self): + def getOutputFiles(self, options): """ return the output files if applicable to this Tester """ return [] @@ -348,10 +354,6 @@ def getSlots(self, options): """ return number of slots to use for this tester """ return self.getThreads(options) * self.getProcs(options) - def getCommand(self, options): - """ return the executable command that will be executed by the tester """ - return '' - def hasOpenMPI(self): """ return whether we have openmpi for execution @@ -372,48 +374,38 @@ def hasOpenMPI(self): return False return Path(which_mpiexec).parent.absolute() == Path(which_ompi_info).parent.absolute() - def spawnProcessFromOptions(self, timer, options): - """ - Spawns a process based on given options, sets output and error files, - and starts timer. + def getCommand(self, options): """ - cmd = self.getCommand(options) - cwd = self.getTestDir() + Return the command that the Tester wants ran - # Verify that the working directory is available right before we execute. - if not os.path.exists(cwd): - # Timers must be used since they are directly indexed in the Job class - timer.start() - self.setStatus(self.fail, 'WORKING DIRECTORY NOT FOUND') - timer.stop() - return 1 - - # Spawn the process - self._runner.spawn(cmd, cwd, timer) + We say "wants ran" here because the Runner may inject something + within the command, for example when running within a container. + Due to this distinction, you can obtain the command that was + actually ran via getCommandRan() - return 0 - - def finishAndCleanupProcess(self, timer): - """ - Waits for the current process to finish, stops the timer, and - cleans up. 
+ The first value is the argument without a parallel executor + (something like mpiexec -n ...) and the second value is + the parallel argument (if any, otherwise None) """ + return None, None - self._runner.wait(timer) - - def runCommand(self, timer, options): + def setCommandRan(self, command): """ - Helper method for running external (sub)processes as part of the tester's execution. This - uses the tester's getCommand and getTestDir methods to run a subprocess. The timer must - be the same timer passed to the run method. Results from running the subprocess is stored - in the tester's output and exit_code fields. + Sets the command that was actually ran. + + This is needed to account for running commands within containers + and needing to run an additional command up front (i.e., with + a pbs or slurm scheduler calling something like qsub) """ + self.command_ran = command - exit_code = self.spawnProcessFromOptions(timer, options) - if exit_code: # Something went wrong - return + def getCommandRan(self): + """ + Gets the command that was actually ran. - self.finishAndCleanupProcess(timer) + See setCommandRan() for the distinction. + """ + return self.command_ran def killCommand(self): """ @@ -424,7 +416,7 @@ def killCommand(self): # Try to clean up anything else that we can self.cleanup() - def run(self, timer, options): + def run(self, job, options, timer): """ This is a method that is the tester's main execution code. Subclasses can override this method with custom code relevant to their specific testing needs. By default this method @@ -433,7 +425,28 @@ def run(self, timer, options): if needed. The run method is responsible to call the start+stop methods on timer to record the time taken to run the actual test. start+stop can be called multiple times. """ - self.runCommand(timer, options) + # Verify that the working directory is available right before we execute. 
+ if not os.path.exists(self.getTestDir()): + # Timers must be used since they are directly indexed in the Job class + timer.start() + self.setStatus(self.fail, 'WORKING DIRECTORY NOT FOUND') + timer.stop() + return + + # Spawn the process + try: + self._runner.spawn(timer) + except Exception as e: + raise Exception('Failed to spawn process') from e + + # And wait for it to complete + self._runner.wait(timer) + + def postSpawn(self): + """ + Entry point for after the process has been spawned + """ + return def processResultsCommand(self, moose_dir, options): """ method to return the commands (list) used for processing results """ @@ -469,6 +482,18 @@ def clearCaveats(self): self.__caveats = set([]) return self.getCaveats() + def mustOutputExist(self): + """ Whether or not we should check for the output once it has ran + + We need this because the PBS/slurm Runner objects, which use + networked file IO, need to wait until the output is available on + on the machine that submitted the jobs. 
A good example is RunException, + where we should only look for output when we get a nonzero return + code.""" + return self.getExitCode() == 0 + + # need something that will tell us if we should try to read the result + def checkRunnableBase(self, options): """ Method to check for caveats that would prevent this tester from diff --git a/python/TestHarness/testers/bench.py b/python/TestHarness/testers/bench.py index 878563eeb417..41799794409c 100644 --- a/python/TestHarness/testers/bench.py +++ b/python/TestHarness/testers/bench.py @@ -112,6 +112,10 @@ def run(self, timer=None, timeout=300): shutil.rmtree(tmpdir) + def getOutput(self): + # this has never produced any output :( + return '' + class SpeedTest(Tester): @staticmethod def validParams(): @@ -153,7 +157,7 @@ def checkRunnable(self, options): return True # override - def run(self, timer, options): + def run(self, job, options, timer): p = self.params if not self.check_only and options.method not in ['opt', 'oprof', 'dbg']: raise ValueError('cannot run benchmark with "' + options.method + '" build') diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index f89ae79423af..e42832f4d174 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -15,7 +15,7 @@ import yaml import sys -TERM_COLS = int(os.getenv('MOOSE_TERM_COLS', '110')) +TERM_COLS = int(os.getenv('MOOSE_TERM_COLS', os.get_terminal_size().columns * 5/6)) TERM_FORMAT = os.getenv('MOOSE_TERM_FORMAT', 'njcst') MOOSE_OPTIONS = { From f730ae384a1edf3fc52d3c2599e5a21367d26dae Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 10:21:19 -0600 Subject: [PATCH 003/243] Remove SpeedTest refs #27562 --- .../content/application_development/index.md | 2 - .../performance_benchmarking.md | 165 ------- .../step07_parallel.md | 2 - modules/doc/content/infrastructure/index.md | 1 - python/TestHarness/TestHarness.py | 9 +- python/TestHarness/testers/bench.py | 408 ------------------ 
.../tests/kernels/simple_diffusion/speedtests | 17 - .../simple_transient_diffusion/speedtests | 13 - 8 files changed, 1 insertion(+), 616 deletions(-) delete mode 100644 modules/doc/content/application_development/performance_benchmarking.md delete mode 100644 python/TestHarness/testers/bench.py delete mode 100644 test/tests/kernels/simple_diffusion/speedtests delete mode 100644 test/tests/kernels/simple_transient_diffusion/speedtests diff --git a/modules/doc/content/application_development/index.md b/modules/doc/content/application_development/index.md index f915e35b0ec5..0b8c72d65ab6 100644 --- a/modules/doc/content/application_development/index.md +++ b/modules/doc/content/application_development/index.md @@ -20,8 +20,6 @@ These documentation pages are meant to be used by developers who are developing [Test System](/test_system.md) - How to create/maintain tests for your application -[Performance Benchmarking](/performance_benchmarking.md) - How to perform benchmarking - [Profiling](/profiling.md) - How to profile your application in order to determine what functions are hogging compute time. [Code Coverage](/coverage.md) - How to add automatic code coverage to your application, and use it in your development workflow diff --git a/modules/doc/content/application_development/performance_benchmarking.md b/modules/doc/content/application_development/performance_benchmarking.md deleted file mode 100644 index 1950bf72d33e..000000000000 --- a/modules/doc/content/application_development/performance_benchmarking.md +++ /dev/null @@ -1,165 +0,0 @@ -# Performance Benchmarking - -Utilities for doing performance benchmarking of MOOSE-based applications are included in the main -MOOSE repository. These utilities provide functionality for benchmarking and tracking MOOSE -performance. They can be used to run benchmarks, generate trend visualizations, and look at stats -comparing benchmarks between various revisions. 
The following sections describe how to setup a -benchmark machine and use it to run benchmarks and visualize results. - -## Tuning a Benchmarking Machine - -In order to obtain accurate results, you need to run the benchmark process(es) -as close to isolated as possible. On a linux system, you should e.g. use cpu -isolation via setting kernel boot parameters: - -```text -isolcpus=[n] rcu_nocbs=[n] -``` - -in your boot loader (e.g. grub). The benchmarking tools/scripts in MOOSE should automatically -detect CPU isolation on Linux and schedule benchmark jobs to those CPUs. You should also disable -any turbo functionality. For example on `intel_pstate` driver cpus: - -```text -$ echo "1" > /sys/devices/system/cpu/intel_pstate/no_turbo -``` - -You will also want to turn off any hyperthreading for cores you use for benchmarking. You can do -this in the bios or by something like: - -```text -$ echo "0" > /sys/devices/system/cpu/cpu[n]/online -``` - -for each hyperthread core you want running - you can look in `/proc/cpuinfo` for pairs of cpus -that have the same core id turning off one of the pair. These will need to be done on every boot. -You can use the sysfsutils package and its `/etc/sysfs.conf` configuration file to do this -persistently on boot - i.e.: - -```text -devices/system/cpu/intel_pstate/no_turbo = 1 -devices/system/cpu/cpu3/online = 0 -devices/system/cpu/cpu5/online = 0 -``` - -## Test Harness Benchmarks - -Benchmarks can be run through the test harness (i.e. using the `run_tests` script) by doing -e.g. `./run_tests --run speedtests`. When this is done, the test harness looks for test spec -files named `speedtests` just like the `tests` files that contain regular moose test details. 
-The format for these files is: - -```text -[Benchmarks] - [benchmark-name] - type = SpeedTest - input = input-file-name.i - cli_args = '--an-arg=1 a/hit/format/cli/arg=foo' - # optional: - min_runs = 15 # default 40 - max_runs = 100 # default 400 - cumulative_dur = 100 # default 60 sec - [] - - [./benchmark2-name] - type = SpeedTest - input = another-input-file-name.i - cli_args = 'some/cli/arg=bar' - [] - - # ... -[] -``` - -After being run, benchmark data are stored in a sqlite database (default name -`speedtests.sqlite`). When the test harness is run without the `--run speedtests` flag, tests -described in `speedtests` files are run in *check-only* mode where moose just checks that their -input files are well-formed and parse correctly without actually running them. - - -## Manual/Direct Benchmarks - -The `[moose-repo]/scripts/benchmark.py` script can be used to manually list and directly run benchmarks without the -test harness (for hacking, debugging, etc.). To do this, the script reads a `bench.list` text -file that specifies which input files should be run and corresponding (benchmark) names for them -along with any optional arguments. The `bench.list` file has the following format: - -```text -[benchmarks] - [./simple_diffusion_refine3] - binary = test/moose_test-opt - input = test/tests/kernels/simple_diffusion/simple_diffusion.i - cli_args = 'Mesh/uniform_refine=3' - [../] - [./simple_diffusion_refine4] - binary = test/moose_test-opt - input = test/tests/kernels/simple_diffusion/simple_diffusion.i - cli_args = 'Mesh/uniform_refine=4' - [../] - [./simple_diffusion_ref5] - binary = test/moose_test-opt - input = test/tests/kernels/simple_diffusion/simple_diffusion.i - cli_args = 'Mesh/uniform_refine=5' - [../] - # ... add as many as you want -[] -``` - -To run the manual benchmarks directly, do this: - -```text -$ ./scripts/benchmark.py --run -``` - -When benchmarks are run, the binaries specified in `bench.list` must already exist. 
Benchmark -data are then stored in a sqlite database (default name `speedtests.sqlite`). You can specify -the minimum number of runs for each benchmark problem/simulation with the `--min-runs` (default -10). Each benchmark will be run as many times as possible within 1 minute (customizable via the -`--cum-dur` flag) or the specified minimum number of times (whichever is larger). - -## Analyzing Results - -Regardless of how you ran the benchmarks (either by this script or using the test harness), MOOSE -revisions with available benchmark data can be listed (from the database) by running: - -```text -$ ./benchmark.py --list-revs -44d2f3434b3346dc14fc9e86aa99ec433c1bbf10 2016-09-07 19:36:16 -86ced0d0c959c9bdc59497f0bc9324c5cdcd7e8f 2016-09-08 09:29:17 -447b455f1e2d8eda649468ed03ef792504d4b467 2016-09-08 09:43:56 -... -``` - -To look at stats comparing benchmark data from two revisions, run: - -```text -$ ./benchmark.py # defaults to using the most recent two revisions of benchmark data --------------------------------- 871c98630c98 to 38bb6f5ebe5f -------------------------------- - benchmark old (sec/run) new (sec/run) speedup (pvalue,nsamples) ----------------------------------------------------------------------------------------------- - simple diffusion (refine3): 0.408034 0.408034 ~ (p=0.996 n=36+36) - - simple diffusion (refine4): 1.554724 1.561682 ~ (p=0.571 n=10+10) - simple diffusion (refine5): 6.592326 6.592326 ~ (p=0.882 n=4+4) ----------------------------------------------------------------------------------------------- - -$ ./benchmark.py -old 44d2f34 -new 447b455 # or specify revisions to compare manually -------------------------------------- 44d2f34 to 447b455 ------------------------------------- - benchmark old (sec/run) new (sec/run) speedup (pvalue,nsamples) ----------------------------------------------------------------------------------------------- - simple diffusion (refine3): 0.416574 0.411435 -1.2% (p=0.000 n=37+37) - simple diffusion 
(refine4): 1.554724 1.497379 -3.7% (p=0.000 n=10+11) - simple diffusion (refine5): 6.553244 6.360004 -2.9% (p=0.030 n=4+4) ----------------------------------------------------------------------------------------------- -``` - -To generate visualizations, run: - -```text -$ ./scripts/benchmark.py --trends -``` - -This will generate an svg box plot for each benchmark over time/revision in a `trends` -subdirectory. An `index.html` file is also generated that embeds all the svg plots for -convenient viewing all together in a browser. - diff --git a/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md b/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md index c94c0812c394..7308f04cd300 100644 --- a/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md +++ b/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md @@ -69,8 +69,6 @@ There is an entire field of science about [!ac](HPC) and massively parallel proc !alert tip title=Try to target 20,000 [!ac](DOFs)-per-process. MOOSE developers tend to agree that 20,000 is the ideal number of [!ac](DOFs) that a single process may be responsible for. This value is reported as "`Num Local DOFs`" in the terminal printout at the beginning of every execution. There are, of course, some exceptions; if a problem exhibits speedup with less than 20,000 [!ac](DOFs)/process, then just use that. 
-*For more information about application performance, please visit the [application_development/performance_benchmarking.md] page.* - ## Demonstration To demonstrate the importance of parallel execution, the current Darcy pressure input file will be diff --git a/modules/doc/content/infrastructure/index.md b/modules/doc/content/infrastructure/index.md index 18f672117fa6..c57b56d0504f 100644 --- a/modules/doc/content/infrastructure/index.md +++ b/modules/doc/content/infrastructure/index.md @@ -9,4 +9,3 @@ of MOOSE and MOOSE-based applications: - [Python Tools](python/index.md) - [Build System](/build_system.md) - [Test System](/test_system.md) -- [Benchmarking](/performance_benchmarking.md) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 4bb442edf4ee..cc5bcdfb60e1 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -220,7 +220,7 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): # Finally load the plugins! self.factory.loadPlugins(dirs, 'testers', "IS_TESTER") - self._infiles = ['tests', 'speedtests'] + self._infiles = ['tests'] self.parse_errors = [] self.test_table = [] self.num_passed = 0 @@ -1119,13 +1119,6 @@ def checkAndUpdateCLArgs(self): print('Do not be an oxymoron with --verbose and --quiet') sys.exit(1) - # Flatten input_file_name from ['tests', 'speedtests'] to just tests if none supplied - # We can not support running two spec files during one launch into a third party queue manager. - # This is because Jobs created by spec files, have no way of accessing other jobs created by - # other spec files. They only know about the jobs a single spec file generates. - # NOTE: Which means, tests and speedtests running simultaneously currently have a chance to - # clobber each others output during normal operation!? 
- # Update any keys from the environment as necessary if not self.options.method: if 'METHOD' in os.environ: diff --git a/python/TestHarness/testers/bench.py b/python/TestHarness/testers/bench.py deleted file mode 100644 index 41799794409c..000000000000 --- a/python/TestHarness/testers/bench.py +++ /dev/null @@ -1,408 +0,0 @@ -#!/usr/bin/env python3 -#* This file is part of the MOOSE framework -#* https://www.mooseframework.org -#* -#* All rights reserved, see COPYRIGHT for full restrictions -#* https://github.com/idaholab/moose/blob/master/COPYRIGHT -#* -#* Licensed under LGPL 2.1, please see LICENSE for details -#* https://www.gnu.org/licenses/lgpl-2.1.html - -import subprocess -import time -import sys -import os -import gc -import shutil -import csv -import tempfile -import threading - -# try to import the resource module. We check further down if it failed -try: - import resource -except: - pass - -from TestHarness.testers.Tester import Tester - -def process_timeout(proc, timeout_sec): - kill_proc = lambda p: p.kill() - timer = threading.Timer(timeout_sec, kill_proc, [proc]) - try: - timer.start() - proc.wait() - finally: - timer.cancel() - -class Test: - def __init__(self, executable, infile, rootdir='.', args=None, perflog=False): - self.rootdir = rootdir - self.executable = executable - self.infile = infile - self.args = args - self.dur_secs = 0 - self.perflog = [] - self.getpot_options = ['Outputs/console=false', 'Outputs/exodus=false', 'Outputs/csv=false'] - self.have_perflog = perflog - if self.have_perflog: - self.getpot_options.append('UserObjects/perflog/type=PerflogDumper') - - def _buildcmd(self): - cmdpath = self.executable - infilepath = os.path.abspath(os.path.join(self.rootdir, self.infile)) - cmd = [cmdpath, '-i', infilepath] - if self.args is not None: - cmd.extend(self.args) - cmd.extend(self.getpot_options) - - # check for linux cpu isolation - isolpath = '/sys/devices/system/cpu/isolated' - cpuid = None - if os.path.exists(isolpath): - with 
open(isolpath, 'r') as f: - cpus = f.read().split(',') - if len(cpus[0].strip()) > 0: - cpuid = cpus[0] - if cpuid: - cmd = ['taskset', '-c', cpuid] + cmd - return cmd - - def reset(self): - self.perflog = [] - self.dur_secs = 0 - - def run(self, timer=None, timeout=300): - self.reset() - cmd = self._buildcmd() - - tmpdir = tempfile.mkdtemp() - shutil.rmtree(tmpdir, ignore_errors=True) - os.makedirs(tmpdir) - - rusage = resource.getrusage(resource.RUSAGE_CHILDREN) - start = rusage.ru_utime - gc.disable() - with open(os.devnull, 'w') as devnull: - if timer: - timer.start() - p = subprocess.Popen(cmd, cwd=tmpdir, stdout=devnull, stderr=devnull) - process_timeout(p, timeout) - if timer: - timer.stop() - gc.enable() - rusage = resource.getrusage(resource.RUSAGE_CHILDREN) - end = rusage.ru_utime - - if p.returncode != 0: - raise RuntimeError('command {} returned nonzero exit code'.format(cmd)) - - self.dur_secs = end - start - - # write perflog - if self.have_perflog: - with open(os.path.join(tmpdir, 'perflog.csv'), 'r') as csvfile: - reader = csv.reader(csvfile) - skip = True # use to skip header line - for row in reader: - if not skip: - self.perflog.append(row) - else: - skip = False - - shutil.rmtree(tmpdir) - - def getOutput(self): - # this has never produced any output :( - return '' - -class SpeedTest(Tester): - @staticmethod - def validParams(): - params = Tester.validParams() - params.addParam('input', 'The input file to use for this test.') - params.addParam('test_name', 'The name of the test - populated automatically') - params.addParam('cumulative_dur', 60, 'cumulative time (secs) to run each benchmark') - params.addParam('min_runs', 40, 'minimum number of runs for each benchmark') - params.addParam('max_runs', 400, 'maximum number of runs for each benchmark') - params.addParam('perflog', False, 'true to enable perflog and store its output') - return params - - def __init__(self, name, params): - Tester.__init__(self, name, params) - 
self.tags.append('speedtests') - self.timeout = max(3600, float(params['max_time'])) - self.check_only = False - - self.params = params - self.benchmark = None - self.db = os.environ.get('MOOSE_SPEED_DB', 'speedtests.sqlite') - - # override - def getMaxTime(self): - return self.timeout - - # override - def checkRunnable(self, options): - # check if resource is available - if 'resource' not in sys.modules: - return False - - # if user is not explicitly running benchmarks, we only run moose once and just check - # input - to make sure the benchmark isn't broken. - if 'speedtests' not in options.runtags: - self.params['max_runs'] = 1 - self.params['cli_args'].insert(0, '--check-input') - self.check_only = True - return True - - # override - def run(self, job, options, timer): - p = self.params - if not self.check_only and options.method not in ['opt', 'oprof', 'dbg']: - raise ValueError('cannot run benchmark with "' + options.method + '" build') - t = Test(p['executable'], p['input'], args=p['cli_args'], rootdir=p['test_dir'], perflog=p['perflog']) - - if self.check_only: - t.run(timer, timeout=p['max_time']) - return - - name = p['test_name'].split('.')[-1] - self.benchmark = Bench(name, test=t, cum_dur=float(p['cumulative_dur']), min_runs=int(p['min_runs']), max_runs=int(p['max_runs'])) - self.benchmark.run(timer, timeout=self.timeout) - with DB(self.db) as db: - db.store(self.benchmark) - - # override - def processResults(self, moose_dir, options, output): - self.setStatus(self.success) - return output - -class Bench: - def __init__(self, name, realruns=None, test=None, cum_dur=60, min_runs=40, max_runs=400): - self.name = name - self.test = test - self.realruns = [] - self.perflogruns = [] - if realruns is not None: - self.realruns.extend(realruns) - self._cum_dur = cum_dur - self._min_runs = min_runs - self._max_runs = max_runs - - def run(self, timer=None, timeout=3600): - tot = 0.0 - start = time.time() - while (len(self.realruns) < self._min_runs or tot < 
self._cum_dur) and len(self.realruns) < self._max_runs: - dt = time.time() - start - if dt >= timeout: - raise RuntimeError('benchmark timed out after {} with {} runs'.format(dt, len(self.realruns))) - - self.test.run(timer, timeout=timeout - dt) - self.realruns.append(self.test.dur_secs) - self.perflogruns.append(self.test.perflog) - tot += self.test.dur_secs - -class BenchComp: - def __init__(self, oldbench, newbench, psig=0.01): - self.name = oldbench.name - self.psig = psig - self.old = oldbench.realruns - self.new = newbench.realruns - - self.iqr_old = _iqr(self.old) - self.iqr_new = _iqr(self.new) - - from scipy.stats import mannwhitneyu - try: - result = mannwhitneyu(self.iqr_old, self.iqr_new, alternative='two-sided') - self.pvalue = result.pvalue - except: - self.pvalue = 1.0 - - self.u = result[0] - self.avg_old = float(sum(self.iqr_old))/len(self.iqr_old) - self.avg_new = float(sum(self.iqr_new))/len(self.iqr_new) - self.speed_change = (self.avg_new - self.avg_old) / self.avg_old - - @classmethod - def header(cls, revold, revnew): - oldstr, newstr = revold, revnew - if len(oldstr) > 12: - oldstr = oldstr[:12] - if len(newstr) > 12: - newstr = newstr[:12] - revstr = ' {} to {} '.format(oldstr, newstr) - revstr = revstr.center(30,'-') - return '' \ - + '--------------------------------{}--------------------------------'.format(revstr) \ - + '\n{:^30s} {:^15s} {:^15s} {:5s}'.format('benchmark', 'old (sec/run)', 'new (sec/run)', 'speedup (pvalue, nsamples)') \ - + '\n----------------------------------------------------------------------------------------------' - @classmethod - def footer(cls): - return '----------------------------------------------------------------------------------------------' - - def __str__(self): - name = self.name - if len(name) > 30: - name = name[:27] + '...' 
- if self.pvalue <= self.psig: - return '{:>30s}: {:^15f} {:^15f} {:+5.1f}% (p={:.4f},n={}+{})'.format(name, self.avg_old, self.avg_new, self.speed_change*100, self.pvalue, len(self.iqr_old), len(self.iqr_new)) - else: - return '{:>30s}: {:^15f} {:^15f} ~ (p={:.4f},n={}+{})'.format(name, self.avg_old, self.avg_new, self.pvalue, len(self.iqr_old), len(self.iqr_new)) - -def _iqr(a, frac=1000): - """return elements of a within frac*iqr of the the interquartile range (inclusive)""" - import numpy - qup, qlow = numpy.percentile(a, [75 ,25]) - - iqr = qup - qlow - clean = [] - for val in a: - if qlow - frac*iqr <= val and val <= qup + frac*iqr: - clean.append(val) - return clean - -class DB: - def __init__(self, fname): - CREATE_BENCH_TABLE = '''CREATE TABLE IF NOT EXISTS benchmarks - ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT, - executable TEXT, - executable_name TEXT, - executable_method TEXT, - input_file TEXT, - timestamp INTEGER, - revision TEXT, - date INTEGER, - load REAL - );''' - - CREATE_TIMES_TABLE = '''CREATE TABLE IF NOT EXISTS timings - ( - benchmark_id INTEGER, - run INTEGER, - realtime_secs REAL - );''' - - CREATE_PERFLOG_TABLE = '''CREATE TABLE IF NOT EXISTS perflog - ( - benchmark_id INTEGER, - run INTEGER, - field TEXT, - subfield TEXT, - exec_count INTEGER, - self_time_secs REAL, - cum_time_secs REAL - );''' - - self.fname = fname - - # python might not have sqlite3 builtin, so do the import here so - # that the TestHarness can always import this file - import sqlite3 - self.conn = sqlite3.connect(fname) - c = self.conn.cursor() - c.execute(CREATE_BENCH_TABLE) - c.execute(CREATE_TIMES_TABLE) - c.execute(CREATE_PERFLOG_TABLE) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def revisions(self, method='opt'): - c = self.conn.cursor() - c.execute('SELECT revision,date FROM benchmarks WHERE executable_method=? 
GROUP BY revision ORDER BY date ASC', (method,)) - rows = c.fetchall() - revs = [] - times = [] - for r in rows: - revs.append(r[0]) - times.append(r[1]) - return revs, times - - def bench_names(self, method='opt'): - c = self.conn.cursor() - c.execute('SELECT DISTINCT name FROM benchmarks WHERE executable_method=?', (method,)) - rows = c.fetchall() - names = [] - for r in rows: - names.append(r[0]) - return names - - def list(self, revision, benchmark='', method='opt'): - c = self.conn.cursor() - if benchmark == '': - c.execute('SELECT id,name,executable,input_file FROM benchmarks WHERE INSTR(revision,?) AND executable_method=? ORDER BY date ASC', (revision,method)) - else: - c.execute('SELECT id,name,executable,input_file FROM benchmarks WHERE INSTR(revision,?) AND name=? AND executable_method=? ORDER BY date ASC', (revision,benchmark,method)) - benchmarks = c.fetchall() - return benchmarks - - def load_times(self, bench_id): - c = self.conn.cursor() - c.execute('SELECT realtime_secs FROM timings WHERE benchmark_id=?', (bench_id,)) - ents = c.fetchall() - real = [] - for ent in ents: - real.append(float(ent[0])) - return real - - def load(self, revision, bench_name, method='opt'): - """loads and returns a Bench object for the given revision and benchmark name""" - entries = self.list(revision, benchmark=bench_name, method=method) - if len(entries) < 1: - raise RuntimeError('load: no benchamrk for revision="{}",bench_name="{}"'.format(revision, bench_name)) - b = entries[0] - real = self.load_times(b[0]) - return Bench(b[1], test=Test(b[2], b[3]), realruns=real) - - def store(self, benchmark, rev=None): - """stores a (run/executed) Bench in the database. 
if rev is None, git revision is retrieved from git""" - ex = benchmark.test.executable - (ex_name, ex_method) = os.path.basename(ex).rsplit('-', 1) - infile = benchmark.test.infile - timestamp = time.time() - date = timestamp - if rev is None: - if 'MOOSE_REVISION' in os.environ: - rev = os.environ['MOOSE_REVISION'] - else: - rev, date = git_revision() - load = os.getloadavg()[0] - - c = self.conn.cursor() - c.execute('INSERT INTO benchmarks (name,executable,executable_name,executable_method,input_file,timestamp,revision,date,load) VALUES (?,?,?,?,?,?,?,?,?)', - (benchmark.name, ex, ex_name, ex_method, infile, timestamp, rev, date, load)) - bench_id = c.lastrowid - self.conn.commit() - - i = 0 - for real, perflog in zip(benchmark.realruns, benchmark.perflogruns): - c.execute('INSERT INTO timings (benchmark_id, run, realtime_secs) VALUES (?,?,?)', (bench_id, i, real)) - i += 1 - for entry in perflog: - cat, subcat, nruns, selftime, cumtime = entry - c.execute('INSERT INTO perflog (benchmark_id, run, field, subfield, exec_count, self_time_secs, cum_time_secs) VALUES (?,?,?,?,?,?,?)', - (bench_id, i, cat, subcat, nruns, selftime, cumtime)) - - return bench_id - - def close(self): - self.conn.commit() - self.conn.close() - -def git_revision(dir='.'): - # return hash and (unix secs since epoch) date - cmd = ['git', 'log', '--date', 'raw', '--pretty=format:%H %ad', '-n', '1'] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, cwd=dir) - stdout, stderr = p.communicate() - if p.returncode != 0: - raise RuntimeError('failed to retrieve git revision') - commit = str(stdout).strip().split(' ')[0] - date = int(str(stdout).strip().split(' ')[1]) - return commit, date diff --git a/test/tests/kernels/simple_diffusion/speedtests b/test/tests/kernels/simple_diffusion/speedtests deleted file mode 100644 index b3bb04633781..000000000000 --- a/test/tests/kernels/simple_diffusion/speedtests +++ /dev/null @@ -1,17 +0,0 @@ -[Benchmarks] - [diffusion_100x100] - type = SpeedTest - input = 
simple_diffusion.i - cli_args = 'Mesh/nx=100 Mesh/ny=100' - [] - [diffusion_200x200] - type = SpeedTest - input = simple_diffusion.i - cli_args = 'Mesh/nx=200 Mesh/ny=200' - [] - [uniform_refine_4] - type = SpeedTest - input = simple_diffusion.i - cli_args = 'Mesh/uniform_refine=4' - [] -[] diff --git a/test/tests/kernels/simple_transient_diffusion/speedtests b/test/tests/kernels/simple_transient_diffusion/speedtests deleted file mode 100644 index 004804b52054..000000000000 --- a/test/tests/kernels/simple_transient_diffusion/speedtests +++ /dev/null @@ -1,13 +0,0 @@ -[Benchmarks] - [./trans_diffusion_100x100_t5] - type = SpeedTest - input = simple_transient_diffusion.i - cli_args = 'Mesh/nx=100 Mesh/ny=100 Executioner/num_steps=5' - [../] - [./trans_diffusion_100x100_t10] - type = SpeedTest - input = simple_transient_diffusion.i - cli_args = 'Mesh/nx=100 Mesh/ny=100 Executioner/num_steps=10' - prereq = 'trans_diffusion_100x100_t5' - [../] -[] From 93a950dfe445ff938db4c0740192dea9da89b525 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 12:23:25 -0600 Subject: [PATCH 004/243] Revert to old behavior for getCommand() refs #27562 --- python/TestHarness/TestHarness.py | 6 +----- python/TestHarness/runners/SubprocessRunner.py | 6 ++---- python/TestHarness/schedulers/RunPBS.py | 14 ++++++++++++-- python/TestHarness/testers/AnalyzeJacobian.py | 2 +- python/TestHarness/testers/CheckFiles.py | 2 +- python/TestHarness/testers/PythonUnitTest.py | 2 +- python/TestHarness/testers/RunApp.py | 17 ++++++++--------- python/TestHarness/testers/RunCommand.py | 2 +- python/TestHarness/testers/Tester.py | 6 +----- 9 files changed, 28 insertions(+), 29 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index cc5bcdfb60e1..889db6c9ca75 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -603,11 +603,7 @@ def printOutput(self, job, color): if job.getCommandRan(): command = 
job.getCommandRan() else: - cmd, mpi_cmd = job.getCommand() - command = '' - if mpi_cmd: - command += f'{mpi_cmd} ' - command += cmd + command = job.getCommand() output = 'Working Directory: ' + job.getTestDir() + '\nRunning command: ' + command + '\n' output += util.trimOutput(job, self.options) output = output.replace('\r', '\n') # replace the carriage returns with newlines diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index 43b78b698159..bcefcd2b0fab 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -32,9 +32,7 @@ def __init__(self, job, options): def spawn(self, timer): tester = self.job.getTester() use_shell = tester.specs["use_shell"] - cmd, mpi_cmd = tester.getCommand(self.options) - if mpi_cmd is not None: - cmd = f'{mpi_cmd} {cmd}' + cmd = tester.getCommand(self.options) tester.setCommandRan(cmd) # Split command into list of args to be passed to Popen @@ -59,7 +57,7 @@ def spawn(self, timer): process_kwargs['preexec_fn'] = os.setsid # Special logic for openmpi runs - if self.hasOpenMPI(): + if tester.hasOpenMPI(): process_env = os.environ.copy() # Don't clobber state diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 6f31361f684e..71e7d1693512 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -146,6 +146,13 @@ def getPBSJobSubmissionPath(self, job): """Gets the aboslute path for the qsub script for a PBS job""" return self.getPBSJobOutputPathPrefix(job) + '.qsub' + @staticmethod + def parseMPICommand(command): + find_mpi = re.search('^mpiexec -n [0-9]+ ', command) + if find_mpi is not None: + return find_mpi.group(0) + return None + def submitJob(self, job): """Submits a PBS job""" tester = job.getTester() @@ -162,10 +169,13 @@ def submitJob(self, job): # Set up the command. 
We have special logic here for when we're using apptainer, # where we need to put the MPI command outside of the apptainer call - command, mpi_command = tester.getCommand(options) full_command = '' + command = tester.getCommand(options) + mpi_command = self.parseMPICommand(command) if mpi_command: - full_command += f'{mpi_command} ' + command = command.replace(mpi_command, '') + full_command += mpi_command + APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: apptainer_cmd = f'apptainer exec {APPTAINER_CONTAINER}' diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 50942864ccfa..da4544937a17 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -68,7 +68,7 @@ def getCommand(self, options): if len(specs['cli_args']): command += '--cli-args "' + (' '.join(specs['cli_args']) + '"') - return command, None + return command def processResults(self, moose_dir, options, output): diff --git a/python/TestHarness/testers/CheckFiles.py b/python/TestHarness/testers/CheckFiles.py index 6cbae7e2f763..070957dfa7c7 100644 --- a/python/TestHarness/testers/CheckFiles.py +++ b/python/TestHarness/testers/CheckFiles.py @@ -83,4 +83,4 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False - return FileTester.checkRunnable(self, options) + return super().checkRunnable(options) diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 4ff1f301861c..d64936dc131a 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -47,4 +47,4 @@ def getCommand(self, options): cmd = "python3 -m unittest" + use_buffer + "-v " + test_case # We need to append PYTHONPATH here for running these within apptainer - return f'PYTHONPATH={self.getMooseDir()}/python ' + cmd + ' '.join(self.specs['cli_args']), None + return 
f'PYTHONPATH={self.getMooseDir()}/python ' + cmd + ' '.join(self.specs['cli_args']) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 80074d2f0820..2da2a0596904 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -225,16 +225,15 @@ def getCommand(self, options): elif nthreads > 1: command = command + ' --n-threads=' + str(nthreads) - mpi_command = None if self.force_mpi or options.parallel or ncpus > 1: - # Arbitrary proxy command, but keep track of the command so that someone could use it later - if specs.isValid('command_proxy'): - raise Exception('no worky yet') - # command = command.replace('"', r'\"') - # command = f'RUNAPP_COMMAND="{command}" {os.path.join(specs["test_dir"], specs["command_proxy"])}' - mpi_command = f'{self.mpi_command} -n {ncpus}' - - return command, mpi_command + command = f'{self.mpi_command} -n {ncpus} {command}' + + # Arbitrary proxy command, but keep track of the command so that someone could use it later + if specs.isValid('command_proxy'): + command = command.replace('"', r'\"') + return f'RUNAPP_COMMAND="{command}" {os.path.join(specs["test_dir"], specs["command_proxy"])}' + + return command def testFileOutput(self, moose_dir, options, output): """ Set a failure status for expressions found in output """ diff --git a/python/TestHarness/testers/RunCommand.py b/python/TestHarness/testers/RunCommand.py index a380fa39bb14..ff5e888c18b2 100644 --- a/python/TestHarness/testers/RunCommand.py +++ b/python/TestHarness/testers/RunCommand.py @@ -24,7 +24,7 @@ def __init__(self, name, params): def getCommand(self, options): # Create the command line string to run - return self.command, None + return self.command def processResults(self, moose_dir, options, output): if self.getExitCode() == 77 : diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 63e77f3891e3..a19cb67267a3 100644 --- 
a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -382,12 +382,8 @@ def getCommand(self, options): within the command, for example when running within a container. Due to this distinction, you can obtain the command that was actually ran via getCommandRan() - - The first value is the argument without a parallel executor - (something like mpiexec -n ...) and the second value is - the parallel argument (if any, otherwise None) """ - return None, None + return None def setCommandRan(self, command): """ From 3110be9e88a92706f3008e2f2eeef560098b198c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 12:23:39 -0600 Subject: [PATCH 005/243] Remove unused include --- python/TestHarness/schedulers/RunPBS.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 71e7d1693512..067244704b04 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -13,9 +13,7 @@ from timeit import default_timer as clock from PBScodes import * import paramiko - import jinja2 -from jinja2 import meta ## This Class is responsible for maintaining an interface to the PBS scheduling syntax class RunPBS(RunParallel): From f7d649b9301d2bff931446ef885dbb83fc93d482 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 12:23:55 -0600 Subject: [PATCH 006/243] Skip unsupported capability refs #27562 --- python/TestHarness/testers/RunApp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 2da2a0596904..f3dc5a0e3845 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -119,6 +119,11 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False + if options.pbs and self.specs.isValid('command_proxy') and os.environ.get('APPTAINER_CONTAINER') is not None: + self.addCaveats('PBS 
NOT SUPPORTED') + self.setStatus(self.skip) + return False + return True def getThreads(self, options): From 64592508ad09cdaa34d1fc12bc83bb3cff1acca4 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 12:34:51 -0600 Subject: [PATCH 007/243] Fix mpi command parse --- python/TestHarness/schedulers/RunPBS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 067244704b04..4a89976e0478 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -146,9 +146,9 @@ def getPBSJobSubmissionPath(self, job): @staticmethod def parseMPICommand(command): - find_mpi = re.search('^mpiexec -n [0-9]+ ', command) + find_mpi = re.search('^(mpiexec -n [0-9]+ )', command) if find_mpi is not None: - return find_mpi.group(0) + return find_mpi.group(1) return None def submitJob(self, job): From 9841d0f200900bb4359e9bf5e53b340f3e797ff3 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 12:58:13 -0600 Subject: [PATCH 008/243] Fix quotes --- python/TestHarness/schedulers/RunPBS.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 4a89976e0478..b38e4f766410 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -168,7 +168,7 @@ def submitJob(self, job): # Set up the command. 
We have special logic here for when we're using apptainer, # where we need to put the MPI command outside of the apptainer call full_command = '' - command = tester.getCommand(options) + command = tester.getCommand(options).replace('"', "'") mpi_command = self.parseMPICommand(command) if mpi_command: command = command.replace(mpi_command, '') @@ -176,10 +176,7 @@ def submitJob(self, job): APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: - apptainer_cmd = f'apptainer exec {APPTAINER_CONTAINER}' - full_command += f'{apptainer_cmd} ' - # The double quotes around the exec command here are important as apptainer exec - # doesn't work well with our command line arguments for some reason + full_command += f'apptainer exec {APPTAINER_CONTAINER} ' full_command += f'"{command}"' num_procs = tester.getProcs(options) From d689e33a265a8ab52d422d2d835535a7900adaa1 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 13:05:47 -0600 Subject: [PATCH 009/243] Only use terminal_size if we can --- python/TestHarness/util.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index e42832f4d174..1dd67b6433b8 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -15,7 +15,13 @@ import yaml import sys -TERM_COLS = int(os.getenv('MOOSE_TERM_COLS', os.get_terminal_size().columns * 5/6)) +DEFAULT_TERM_COLS = None +try: + DEFAULT_TERM_COLS = os.get_terminal_size().columns * 5/6 +except: + DEFAULT_TERM_COLS = 110 + pass +TERM_COLS = int(os.getenv('MOOSE_TERM_COLS', DEFAULT_TERM_COLS)) TERM_FORMAT = os.getenv('MOOSE_TERM_FORMAT', 'njcst') MOOSE_OPTIONS = { From 5412c0de23644f97069ad1d49fb3844babeecc48 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 13:06:01 -0600 Subject: [PATCH 010/243] Remove extraneous quotes --- test/tests/postprocessors/find_value_on_line/tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/test/tests/postprocessors/find_value_on_line/tests b/test/tests/postprocessors/find_value_on_line/tests index 84cd58ccc07b..d5f2b216e27d 100644 --- a/test/tests/postprocessors/find_value_on_line/tests +++ b/test/tests/postprocessors/find_value_on_line/tests @@ -67,7 +67,7 @@ [./line_out_of_bounds] type = 'RunException' input = 'findvalueonline.i' - cli_args = "\"Postprocessors/pos/end_point='11 0 0' Outputs/csv=false\"" + cli_args = "Postprocessors/pos/end_point='11 0 0' Outputs/csv=false" expect_err = "No element found at the current search point" requirement = 'The system shall report and error when the sampling line extends beyond the mesh bounding box.' From b49de567bf7b9d386e424ceff113583b7c71a2cc Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 13:11:19 -0600 Subject: [PATCH 011/243] Skip unit tests on PBS --- python/TestHarness/testers/PythonUnitTest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index d64936dc131a..41763d8b5a01 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -48,3 +48,11 @@ def getCommand(self, options): # We need to append PYTHONPATH here for running these within apptainer return f'PYTHONPATH={self.getMooseDir()}/python ' + cmd + ' '.join(self.specs['cli_args']) + + def checkRunnable(self, options): + if options.pbs: + self.addCaveats('PBS NOT SUPPORTED') + self.setStatus(self.skip) + return False + + return super().checkRunnable(options) From 15e1528df5e18ff5fd744697068732fd0032e976 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 13:35:40 -0600 Subject: [PATCH 012/243] Load the contents into the submission instead of source --- python/TestHarness/schedulers/RunPBS.py | 9 ++++++++- python/TestHarness/schedulers/pbs_template | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git 
a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index b38e4f766410..21ceb6e04e87 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -94,6 +94,11 @@ def __init__(self, harness, params): print(f'ERROR: --pbs-pre-source path {self.options.queue_source_command} does not exist') sys.exit(1) + # Load the pre-source if it exists + self.source_contents = None + if self.options.queue_source_command: + self.source_contents = open(self.options.queue_source_command, 'r').read() + class CallPBSException(Exception): """Exception class for providing extra context for PBS submission errors""" def __init__(self, run_pbs, description, command, result=None): @@ -197,7 +202,9 @@ def submitJob(self, job): if self.options.queue_queue: template_env['QUEUE'] = self.options.queue_queue if self.options.queue_source_command: - template_env['SOURCE_COMMAND'] = self.options.queue_source_command + template_env['SOURCE_FILE'] = self.options.queue_source_command + if self.source_contents: + template_env['SOURCE_CONTENTS'] = self.source_contents # Build the script jinja_env = jinja2.Environment() diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 98126ec1a151..e5ffbc379396 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -10,9 +10,9 @@ #PBS -o {{ OUTPUT }} #PBS -l place={{ PLACE }} -{%- if SOURCE_COMMAND is defined %} -# Set by the --pbs-pre-source TestHarness option -source {{ SOURCE_COMMAND }} || exit $? 
+{%- if SOURCE_FILE is defined %} +# Loaded from {{ SOURCE_FILE }} +{{ SOURCE_CONTENTS }} {%- endif %} # Print a useful header From 11ee967d72a43b0211c68a0f03aafbdb3828776a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 13:48:58 -0600 Subject: [PATCH 013/243] Catch early errors --- python/TestHarness/schedulers/pbs_template | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index e5ffbc379396..8b5b638845f0 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -10,6 +10,8 @@ #PBS -o {{ OUTPUT }} #PBS -l place={{ PLACE }} +set -e + {%- if SOURCE_FILE is defined %} # Loaded from {{ SOURCE_FILE }} {{ SOURCE_CONTENTS }} @@ -30,6 +32,8 @@ echo "########################################################################## # Move into the test directory cd {{ CWD }} +set +e + # Run the command {{ COMMAND }} # ...and capture the return code cause we're not done yet From 38487447fa562c0316943358a6f44bee0de611d5 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 14:10:48 -0600 Subject: [PATCH 014/243] Correct callback --- python/TestHarness/testers/SignalTester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/testers/SignalTester.py b/python/TestHarness/testers/SignalTester.py index e2fa81b469c3..bcbbd97f37ef 100644 --- a/python/TestHarness/testers/SignalTester.py +++ b/python/TestHarness/testers/SignalTester.py @@ -44,7 +44,7 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False - return super().checkRunnable(self, options) + return super().checkRunnable(options) def postSpawn(self): self._runner.sendSignal(self.signal) From e9a435f7f71c34cb7655a37b0236c21b7578c0ae Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 19:49:58 -0600 Subject: [PATCH 015/243] Check for shouldExecute() now because exit code could be None --- 
python/TestHarness/testers/RunApp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index f3dc5a0e3845..7205fb67ba5f 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -321,7 +321,7 @@ def testExitCodes(self, moose_dir, options, output): reason = 'ERRMSG' elif self.getExitCode() == 0 and specs['should_crash'] == True: reason = 'NO CRASH' - elif self.getExitCode() != 0 and specs['should_crash'] == False: + elif self.getExitCode() != 0 and specs['should_crash'] == False and self.shouldExecute(): # Let's look at the error code to see if we can perhaps further split this out later with a post exam reason = 'CRASH' # Valgrind runs From 96615896874e4f9a3b3c0e0da67ae92a3b0bd809 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 19:55:06 -0600 Subject: [PATCH 016/243] Add missing postSpawn call --- python/TestHarness/testers/Tester.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index a19cb67267a3..a15c64011b78 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -435,6 +435,9 @@ def run(self, job, options, timer): except Exception as e: raise Exception('Failed to spawn process') from e + # Entry point for testers to do other things + self.postSpawn() + # And wait for it to complete self._runner.wait(timer) From 52aea36d180807fc1168523c68592fceffa80998 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 20:00:38 -0600 Subject: [PATCH 017/243] Only skip for parallel apptainer --- python/TestHarness/testers/PythonUnitTest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 41763d8b5a01..e5e3dbca01de 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ 
b/python/TestHarness/testers/PythonUnitTest.py @@ -50,8 +50,10 @@ def getCommand(self, options): return f'PYTHONPATH={self.getMooseDir()}/python ' + cmd + ' '.join(self.specs['cli_args']) def checkRunnable(self, options): - if options.pbs: - self.addCaveats('PBS NOT SUPPORTED') + # Can't run within apptainer in parallel because mpiexec needs to be + # executed outside of the apptainer call + if os.environ.get('APPTAINER_CONTAINER') and self.getProcs(options) > 1: + self.addCaveats('PARALLEL APPTAINER') self.setStatus(self.skip) return False From 8f20c53218c1ef2d227db09e9c22d89d582b445d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 20:01:00 -0600 Subject: [PATCH 018/243] Set MOOSE PYTHONPATH in the template --- python/TestHarness/schedulers/RunPBS.py | 6 +++++- python/TestHarness/schedulers/pbs_template | 3 +++ python/TestHarness/testers/PythonUnitTest.py | 3 +-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 21ceb6e04e87..9e2f62f621e2 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -188,6 +188,9 @@ def submitJob(self, job): num_threads = tester.getThreads(options) walltime = str(datetime.timedelta(seconds=tester.getMaxTime())) + # Add MOOSE's python path for python scripts + moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../..')) + # Set up the template template_env = {'NAME': self.getPBSJobName(job), 'SELECT': f'{num_procs}:mpiprocs=1:ncpus={num_threads}', @@ -198,7 +201,8 @@ def submitJob(self, job): 'SUBMITTED_HOSTNAME': socket.gethostname(), 'CWD': tester.getTestDir(), 'COMMAND': full_command, - 'ENDING_COMMENT': self.getOutputEndingComment()} + 'ENDING_COMMENT': self.getOutputEndingComment(), + 'MOOSE_PYTHONPATH': moose_python} if self.options.queue_queue: template_env['QUEUE'] = self.options.queue_queue if self.options.queue_source_command: 
diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 8b5b638845f0..c4c1715237c0 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -17,6 +17,9 @@ set -e {{ SOURCE_CONTENTS }} {%- endif %} +# Add MOOSE's python path for python scripts +export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} + # Print a useful header echo "################################################################################" echo "Beginning TestHarness RunPBS job" diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index e5e3dbca01de..645c3bda8432 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -46,8 +46,7 @@ def getCommand(self, options): else: cmd = "python3 -m unittest" + use_buffer + "-v " + test_case - # We need to append PYTHONPATH here for running these within apptainer - return f'PYTHONPATH={self.getMooseDir()}/python ' + cmd + ' '.join(self.specs['cli_args']) + return cmd + ' '.join(self.specs['cli_args']) def checkRunnable(self, options): # Can't run within apptainer in parallel because mpiexec needs to be From ab025136d2ff36963e0c097e650759e4cf248530 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 20:08:45 -0600 Subject: [PATCH 019/243] Use the right path --- python/TestHarness/schedulers/RunPBS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 9e2f62f621e2..f7f7c59da094 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -189,7 +189,7 @@ def submitJob(self, job): walltime = str(datetime.timedelta(seconds=tester.getMaxTime())) # Add MOOSE's python path for python scripts - moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../..')) + 
moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) # Set up the template template_env = {'NAME': self.getPBSJobName(job), From 05a9d0fe1aac1883b872b46e75d5acf6e28d36a2 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 7 May 2024 20:25:08 -0600 Subject: [PATCH 020/243] Forcefully set the output when the job dies early --- python/TestHarness/schedulers/Job.py | 4 ++-- python/TestHarness/schedulers/RunParallel.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 0c1198a81431..fb1052c2d9ca 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -276,9 +276,9 @@ def getOutputFile(self): 'txt'])) return os.path.join(output_dir, output_file) - def setOutput(self, output): + def setOutput(self, output, force=False): """ Method to allow schedulers to overwrite the output if certain conditions are met """ - if not self.__tester.isOutputReady(): + if not self.__tester.isOutputReady() and not force: return # Check for invalid unicode in output diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index b1f75206692b..62673b37eb8e 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -89,6 +89,9 @@ def run(self, job): except Exception: output += 'Python exception encountered:\n\n' + traceback.format_exc() tester.setStatus(StatusSystem().error, 'TESTER EXCEPTION') + # Forcefully set the output here because it might have not initialized + # because the job might not have even run + job.setOutput(output, force=True) # Clean up now that we're done tester.cleanup() From a5abf3dd4178233c40df1a74afd7ab1b539f5e95 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 10:02:13 -0600 Subject: [PATCH 021/243] Try to find a key to use --- python/TestHarness/schedulers/RunPBS.py | 
13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f7f7c59da094..f6043fb41552 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -73,10 +73,21 @@ def __init__(self, harness, params): self.pbs_ssh_lock = None # Setup the jump host if provided if self.pbs_ssh_host: + # Try to find a key to use + key_filename = None + try: + ssh_config = os.path.expanduser('~/.ssh/config') + config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.pbs_ssh_host) + identityfile = config.get('identityfile') + if identityfile is not None and len(identityfile) > 0: + key_filename = identityfile[-1] + except: + pass + self.pbs_ssh_lock = threading.Lock() self.pbs_ssh = paramiko.SSHClient() self.pbs_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.pbs_ssh.connect(self.pbs_ssh_host) + self.pbs_ssh.connect(self.pbs_ssh_host, key_filename=key_filename) # Load the PBS template template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template') From 9387f45233d4f59aa62c9b2cd2cba9968a936dc8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 12:46:20 -0600 Subject: [PATCH 022/243] Remove benchmarking link --- python/doc/content/python/TestHarness.md | 1 - 1 file changed, 1 deletion(-) diff --git a/python/doc/content/python/TestHarness.md b/python/doc/content/python/TestHarness.md index 5fb50931d86d..6609ad163640 100644 --- a/python/doc/content/python/TestHarness.md +++ b/python/doc/content/python/TestHarness.md @@ -89,7 +89,6 @@ Tester system is completely pluggable and extendable. 
The list of default tester - [AnalyzeJacobian](AnalyzeJacobian.md) - [PetscJacobianTester](PetscJacobianTester.md) - [PythonUnitTest](PythonUnitTest.md) (includes [Method of Manufactured solutions](python/mms.md) testing) -- [BenchmarkTesting](application_development/performance_benchmarking.md) ## Test Specifications From 7c3a10f93621bf6654d3cd23212547f1ad41c4e4 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 12:47:18 -0600 Subject: [PATCH 023/243] Call correct member --- python/TestHarness/runners/SubprocessRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index bcefcd2b0fab..d15ca9140da2 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -61,7 +61,7 @@ def spawn(self, timer): process_env = os.environ.copy() # Don't clobber state - process_env['OMPI_MCA_orte_tmpdir_base'] = self.getTempDirectory().name + process_env['OMPI_MCA_orte_tmpdir_base'] = self.getTester().getTempDirectory().name # Allow oversubscription for hosts that don't have a hostfile process_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' From 6171b20d53193d9bf5d65634616026e59af8543c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 13:37:42 -0600 Subject: [PATCH 024/243] Fix parameter quotes --- modules/ray_tracing/test/tests/actions/add_raybc_action/tests | 2 +- .../ray_tracing/test/tests/actions/add_raykernel_action/tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ray_tracing/test/tests/actions/add_raybc_action/tests b/modules/ray_tracing/test/tests/actions/add_raybc_action/tests index ea1d3f6c6024..bd615011f36c 100644 --- a/modules/ray_tracing/test/tests/actions/add_raybc_action/tests +++ b/modules/ray_tracing/test/tests/actions/add_raybc_action/tests @@ -29,7 +29,7 @@ [multiple] type = RunException input = 'add_raybc_action.i' 
- cli_args = 'RayBCs/active=multiple_studies "UserObjects/active=\'study another_study\'"' + cli_args = 'RayBCs/active=multiple_studies UserObjects/active="study another_study"' expect_err = "While constructing the NullRayBC 'multiple_studies', multiple RayTracingStudy objects were found." allow_test_objects = true diff --git a/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests b/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests index 9840e54d6f1f..5135b40bfe9a 100644 --- a/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests +++ b/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests @@ -29,7 +29,7 @@ [multiple] type = RunException input = 'add_raykernel_action.i' - cli_args = 'RayKernels/active=multiple_studies "UserObjects/active=\'study another_study\'"' + cli_args = 'RayKernels/active=multiple_studies UserObjects/active="study another_study"' expect_err = "While constructing the NullRayKernel 'multiple_studies', multiple RayTracingStudy objects were found." 
allow_test_objects = true From f438717ca52c4c050a15bfd1014d66f7928b472d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 14:38:06 -0600 Subject: [PATCH 025/243] Add search for testers from app scripts --- python/TestHarness/TestHarness.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 889db6c9ca75..74173c3d8726 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -208,14 +208,21 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): # Build a Warehouse to hold the MooseObjects self.warehouse = Warehouse() - # Get dependant applications and load dynamic tester plugins - # If applications have new testers, we expect to find them in /scripts/TestHarness/testers + # Testers from this directory dirs = [os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))] - dirs.append(os.path.join(moose_dir, 'share', 'moose', 'python', 'TestHarness', 'testers')) - # Use the find_dep_apps script to get the dependant applications for an app - depend_app_dirs = findDepApps(app_name, use_current_only=True) - dirs.extend([os.path.join(my_dir, 'scripts', 'TestHarness') for my_dir in depend_app_dirs.split('\n')]) + # Get dependent applications and load dynamic tester plugins + # If applications have new testers, we expect to find them in /scripts/TestHarness/testers + # Use the find_dep_apps script to get the dependent applications for an app + app_dirs = findDepApps(app_name, use_current_only=True).split('\n') + # For installed binaries, the apps will exist in RELEASE_PATH/scripts, where in + # this case RELEASE_PATH is moose_dir + share_dir = os.path.join(moose_dir, 'share') + for dir in os.listdir(share_dir): + if dir != 'moose': + app_dirs.append(os.path.join(share_dir, dir)) + # Add scripts/TestHarness for all of the above + dirs.extend([os.path.join(my_dir, 'scripts', 
'TestHarness') for my_dir in app_dirs]) # Finally load the plugins! self.factory.loadPlugins(dirs, 'testers', "IS_TESTER") From 45ab8642ce77e2aa5439ce796a77a43b008c556e Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 15:47:01 -0600 Subject: [PATCH 026/243] Only check if the share dir exists --- python/TestHarness/TestHarness.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 74173c3d8726..48def6b17052 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -218,9 +218,10 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): # For installed binaries, the apps will exist in RELEASE_PATH/scripts, where in # this case RELEASE_PATH is moose_dir share_dir = os.path.join(moose_dir, 'share') - for dir in os.listdir(share_dir): - if dir != 'moose': - app_dirs.append(os.path.join(share_dir, dir)) + if os.path.isdir(share_dir): + for dir in os.listdir(share_dir): + if dir != 'moose': # already included + app_dirs.append(os.path.join(share_dir, dir)) # Add scripts/TestHarness for all of the above dirs.extend([os.path.join(my_dir, 'scripts', 'TestHarness') for my_dir in app_dirs]) From 7072bf996e163a61a1f3c6c1a67c0aec3f6243fb Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 16:38:03 -0600 Subject: [PATCH 027/243] Install testers if they exist --- framework/app.mk | 11 +++++++++++ framework/moose.mk | 5 ++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/framework/app.mk b/framework/app.mk index 5c281d576ba5..a8111811520a 100644 --- a/framework/app.mk +++ b/framework/app.mk @@ -498,6 +498,17 @@ install_data_%: @mkdir -p $($@_dst) @cp -r $($@_src) $($@_dst) +ifneq ($(wildcard $(APPLICATION_DIR)/scripts/TestHarness/testers),) +install_tester_$(APPLICATION_NAME)_src := $(APPLICATION_DIR)/scripts/TestHarness/testers +install_tester_$(APPLICATION_NAME)_dst := 
$(share_install_dir)/scripts/TestHarness +install_testers:: install_tester_$(APPLICATION_NAME) +endif + +install_tester_%: + @echo "Installing TestHarness testers "$($@_dst)"..." + @mkdir -p $($@_dst) + @cp -r $($@_src) $($@_dst) + $(copy_input_targets): @$(eval kv := $(subst ->, ,$(subst target_$(APPLICATION_NAME)_,,$@))) @$(eval source_dir := $(word 1, $(kv))) diff --git a/framework/moose.mk b/framework/moose.mk index 4fee7dc0770e..2e5a2dba7096 100644 --- a/framework/moose.mk +++ b/framework/moose.mk @@ -484,12 +484,15 @@ moose_share_dir = $(share_dir)/moose python_install_dir = $(moose_share_dir)/python bin_install_dir = $(PREFIX)/bin -install: all install_all_libs install_bin install_harness install_exodiff install_adreal_monolith install_hit install_data +install: all install_all_libs install_bin install_harness install_exodiff install_adreal_monolith install_hit install_data install_testers install_data:: @mkdir -p $(moose_share_dir) @cp -a $(FRAMEWORK_DIR)/data $(moose_share_dir)/ +install_testers:: + @: + install_adreal_monolith: ADRealMonolithic.h @ mkdir -p $(moose_include_dir) @cp -f $< $(moose_include_dir)/ From d7ffc7b969d96d26996c66477f0950a616df5b27 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 20:48:20 -0600 Subject: [PATCH 028/243] Let HPC jobs only use one slot --- python/TestHarness/schedulers/RunPBS.py | 7 ++++++- python/TestHarness/schedulers/Scheduler.py | 21 ++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f6043fb41552..fee1496543d0 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -138,8 +138,13 @@ def callPBS(self, command): raise RunPBS.CallPBSException(self, 'Failed to execute remote PBS command', command) from e return exit_code, result.rstrip() + def getJobSlots(self, job): + # Jobs only use one slot because they are ran externally + return 1 
+ def availableSlots(self, params): - return 250, False + # Support managing 500 HPC jobs concurrently + return 500, False def getPBSJobName(self, job): """Gets the name of the PBS job given a tester diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index b577137541e6..0bf9ca5b167a 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -301,6 +301,15 @@ def satisfyLoad(self): while self.slots_in_use > 1 and self.getLoad() >= self.average_load: sleep(1.0) + def getJobSlots(self, job): + """ + Gets the number of slots a job will use. + + This exists so that HPC runners can override it, as + jobs like PBS jobs only use one slot because they are + ran externally.""" + return job.getSlots() + def reserveSlots(self, job, j_lock): """ Method which allocates resources to perform the job. Returns bool if job @@ -311,23 +320,25 @@ def reserveSlots(self, job, j_lock): self.satisfyLoad() with self.slot_lock: + job_slots = self.getJobSlots(job) + can_run = False - if self.slots_in_use + job.getSlots() <= self.available_slots: + if self.slots_in_use + job_slots <= self.available_slots: can_run = True # Check for insufficient slots -soft limit - elif job.getSlots() > self.available_slots and self.soft_limit: + elif job_slots > self.available_slots and self.soft_limit: job.addCaveats('OVERSIZED') can_run = True # Check for insufficient slots -hard limit (skip this job) - elif job.getSlots() > self.available_slots and not self.soft_limit: + elif job_slots > self.available_slots and not self.soft_limit: job.addCaveats('insufficient slots') with j_lock: job.setStatus(job.skip) if can_run: - self.slots_in_use += job.getSlots() + self.slots_in_use += job_slots return can_run def handleTimeoutJob(self, job, j_lock): @@ -470,7 +481,7 @@ def runJob(self, job, jobs, j_lock): # Recover worker count before attempting to queue more jobs with self.slot_lock: - self.slots_in_use = max(0, 
self.slots_in_use - job.getSlots()) + self.slots_in_use = max(0, self.slots_in_use - self.getJobSlots(job)) # Stop the long running timer if job.report_timer: From 046525b06d3aef79d4f87fbdc447230224f4f1e8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 8 May 2024 20:48:40 -0600 Subject: [PATCH 029/243] Reduce slots to 200 --- python/TestHarness/schedulers/RunPBS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index fee1496543d0..288e75947ef7 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -143,8 +143,8 @@ def getJobSlots(self, job): return 1 def availableSlots(self, params): - # Support managing 500 HPC jobs concurrently - return 500, False + # Support managing 200 HPC jobs concurrently + return 200, False def getPBSJobName(self, job): """Gets the name of the PBS job given a tester From 9e3a052e98ab50d40d498418b61d2bed9cf52536 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 9 May 2024 11:26:58 -0600 Subject: [PATCH 030/243] Limit RunException tests to 1 rank in PBS --- python/TestHarness/testers/RunException.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index 5d9a5790dbc3..5b808610a5c0 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -9,6 +9,7 @@ from TestHarness import util from RunApp import RunApp +import os class RunException(RunApp): @@ -35,6 +36,12 @@ def checkRunnable(self, options): self.addCaveats('type=RunException') self.setStatus(self.skip) return False + # We seem to have issues with --redirect-output causing + # "Inappropriate ioctl for device (25)" errors, so if this test + # requires more procs, we can't run it + if options.pbs and int(self.specs['min_parallel'] > 1): + self.addCaveats('PBS 
max_parallel=1')
+            return False
         return RunApp.checkRunnable(self, options)
 
     def prepare(self, options):
@@ -47,6 +54,16 @@ def getOutputFiles(self, options):
             return self.getRedirectedOutputFiles(options)
         return []
 
+    def getProcs(self, options):
+        procs = super().getProcs(options)
+        # We seem to have issues with --redirect-output causing
+        # "Inappropriate ioctl for device (25)" errors, so if this test
+        # doesn't require more procs, just set it to one
+        if options.pbs and int(self.specs['min_parallel']) == 1 and procs != 1:
+            self.addCaveats('PBS max_parallel=1')
+            return 1
+        return procs
+
     def processResults(self, moose_dir, options, output):
         # Exceptions are written to stderr, which can be interleaved so we normally redirect these
         # separate files. Here we must gather those file outputs before processing

From e823658312d69aad9b8a44d92198d458d2c35fd9 Mon Sep 17 00:00:00 2001
From: Logan Harbour
Date: Thu, 9 May 2024 11:31:10 -0600
Subject: [PATCH 031/243] Rename to max_cpus for consistency

---
 python/TestHarness/testers/RunException.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py
index 5b808610a5c0..6783fe699667 100644
--- a/python/TestHarness/testers/RunException.py
+++ b/python/TestHarness/testers/RunException.py
@@ -40,7 +40,7 @@ def checkRunnable(self, options):
         # "Inappropriate ioctl for device (25)" errors, so if this test
         # requires more procs, we can't run it
         if options.pbs and int(self.specs['min_parallel'] > 1):
-            self.addCaveats('PBS max_parallel=1')
+            self.addCaveats('PBS max_cpus=1')
             return False
         return RunApp.checkRunnable(self, options)
 
@@ -60,7 +60,7 @@ def getProcs(self, options):
         # "Inappropriate ioctl for device (25)" errors, so if this test
         # doesn't require more procs, just set it to one
         if options.pbs and int(self.specs['min_parallel']) == 1 and procs != 1:
-            self.addCaveats('PBS max_parallel=1')
+            self.addCaveats('PBS 
max_cpus=1') return 1 return procs From d55d14d8cc6f5dac4576bcf153bd81587d1104e9 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 9 May 2024 16:06:18 -0600 Subject: [PATCH 032/243] Add and check terminator for PBS output files --- python/TestHarness/runners/PBSRunner.py | 133 ++++++++++++++++++--- python/TestHarness/schedulers/RunPBS.py | 13 +- python/TestHarness/schedulers/pbs_template | 19 +++ 3 files changed, 145 insertions(+), 20 deletions(-) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index dd917647bee6..c1d5478fddc0 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from TestHarness.runners.Runner import Runner -import time, os +import re, time, os, subprocess class PBSRunner(Runner): """Runner that spawns a process with PBS. @@ -24,7 +24,7 @@ def __init__(self, job, options, run_pbs): # if the job ended in an unexpected state, it might # not even be using the output and we don't want to # just hang forever - self.wait_output_time = 60 + self.wait_output_time = 120 def spawn(self, timer): from TestHarness.schedulers.RunPBS import RunPBS @@ -86,26 +86,18 @@ def wait(self, timer): wait_files.add(os.path.join(tester.getTestDir(), file)) # Wait for all of the files to be available - file_poll_interval = 0.5 + file_poll_interval = 0.25 waited_time = 0 while wait_files: # Look for each file for file in wait_files.copy(): - # File exists - if os.path.exists(file) and os.path.isfile(file): - # Special case for stdout/stderr, where we append - # something to the end to show that it's actually done - # and then need to read it - # TODO: shorten output as an option? 
- if file == output_file: - output = open(file, 'r').read() - ending_comment = self.run_pbs.getOutputEndingComment() - if ending_comment in output: - self.output = output.replace(ending_comment, '') - else: - continue - # Done with this file - wait_files.discard(file) + if not self.checkFile(file): + continue + # Store the output + if file == output_file: + self.output = open(file, 'r').read() + # Done with this file + wait_files.discard(file) # We've waited for files for too long if wait_files and waited_time >= self.wait_output_time: @@ -121,5 +113,110 @@ def wait(self, timer): waited_time += file_poll_interval time.sleep(file_poll_interval) + def checkFile(self, file): + """ + Checks if a file is ready for reading. + + In summary: + - Check if the file exists + - If the file exists, make sure that it has the terminator + string (to know that we have the full file) + - Remove the terminator string + """ + if not os.path.exists(file) or not os.path.isfile(file): + return False + + # The file terminator check (to we have the up-to-date copy of the file) + # is dependent on whether or not the file is a binary + is_binary = self.isFileBinary(file) + # If this returns None, it means that the "file" command couldn't determine + # the file type, which may be the case if we have an incomplete file so + # just continue and check on the next iteration + if is_binary is None: + return False + + ending_comment = self.run_pbs.getOutputEndingComment() + + # Binary file + if is_binary: + with open(file, "rb+") as file: + # We'll be looking for this many characters + len_comment = len(ending_comment) + + # Move to the end and figure out the position + # back where our terminator should be + file.seek(0, os.SEEK_END) + pos = file.tell() - len_comment + + # File is shorter than our comment + if pos < 0: + return False + + # Move to the position where our terminator _should_ be + file.seek(pos) + + # We try here in the event that we're loading + # an earlier part of the file and we 
can't decode + try: + contents = file.read(len_comment).decode('utf-8') + except: + return False + + # Terminator isn't there + if contents != ending_comment: + return False + + # Remove the terminator + file.seek(pos) + file.truncate() + + return True + # Text file + else: + # Load just the last line of the file + last_line = subprocess.check_output(['tail', '-1', file], text=True) + # Found the match, remove the last line and consider this file available + if ending_comment == last_line: + self.removeLastLine(file) + return True + + return False + + @staticmethod + def removeLastLine(file): + """ + Removes the last line from the given text file. + + Used to remove the terminator that we append to all output + files on the compute host in order to make sure that the + entire output file is synced""" + # stackoverflow.com/questions/1877999/delete-final-line-in-file-with-python + with open(file, "r+", encoding="utf-8") as file: + file.seek(0, os.SEEK_END) + pos = file.tell() - 1 + while pos > 0 and file.read(1) != "\n": + pos -= 1 + file.seek(pos, os.SEEK_SET) + if pos > 0: + file.seek(pos, os.SEEK_SET) + file.truncate() + + @staticmethod + def isFileBinary(file): + """ + Returns whether or not the given file is a binary file. + + If None, a failure was encountered when checking the file type. 
+ """ + try: + call_file = subprocess.check_output(['file', '--mime-encoding', file], text=True) + except: + return None + + # Will return something like ": ", + # where =binary when the file is binary + find_binary = re.search('binary$', call_file) + return find_binary is not None + def kill(self): self.run_pbs.killJob(self.job) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 288e75947ef7..d4ea64bf800f 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -207,6 +207,14 @@ def submitJob(self, job): # Add MOOSE's python path for python scripts moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) + # The output files that we're expected to generate so that the + # PBS job can add a terminator for them so that we can verify + # they are complete on the executing host + output_files = [] + for file in tester.getOutputFiles(options): + output_files.append(f'"{os.path.join(tester.getTestDir(), file)}"') + output_files = ' '.join(output_files) + # Set up the template template_env = {'NAME': self.getPBSJobName(job), 'SELECT': f'{num_procs}:mpiprocs=1:ncpus={num_threads}', @@ -218,7 +226,8 @@ def submitJob(self, job): 'CWD': tester.getTestDir(), 'COMMAND': full_command, 'ENDING_COMMENT': self.getOutputEndingComment(), - 'MOOSE_PYTHONPATH': moose_python} + 'MOOSE_PYTHONPATH': moose_python, + 'OUTPUT_FILES': output_files} if self.options.queue_queue: template_env['QUEUE'] = self.options.queue_queue if self.options.queue_source_command: @@ -310,7 +319,7 @@ def buildRunner(self, job, options): def getOutputEndingComment(self): """Gets the text we append to the PBS stderr+stdout file to desginate that it is complete""" - return 'Completed TestHarness RunPBS job' + return 'TESTHARNESS RUNPBS FILE TERMINATOR' def getPBSJob(self, job): """Gets the PBSJob object for a given Job diff --git a/python/TestHarness/schedulers/pbs_template 
b/python/TestHarness/schedulers/pbs_template index c4c1715237c0..c6678818fd49 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -42,6 +42,25 @@ set +e # ...and capture the return code cause we're not done yet return_code=$? +set +e + +# Append a terminator to all of the output files for file syncing across NFS +OUTPUT_FILES=({{ OUTPUT_FILES }}) +for file in ${OUTPUT_FILES[@]}; do + if [ ! -e "$file" ]; then + echo "Failed to find output file $file" + exit 1 + fi + + # No newline for binaries + if [[ $(file --mime-encoding ${file}) = *binary ]]; then + printf "{{ ENDING_COMMENT }}" >> $file; + # Newline for plain text + else + printf "\n{{ ENDING_COMMENT }}" >> $file; + fi +done + # Append a recognizable string at the end of the output. We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output From a221e79e8440973e79819b8778b8e4f44ba8cb42 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 9 May 2024 21:08:08 -0600 Subject: [PATCH 033/243] Fix terminator checking, add comments --- python/TestHarness/runners/PBSRunner.py | 41 +++++++++++++++------- python/TestHarness/schedulers/RunPBS.py | 5 ++- python/TestHarness/schedulers/pbs_template | 7 ++-- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index c1d5478fddc0..d56d520aa363 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -173,15 +173,32 @@ def checkFile(self, file): return True # Text file else: - # Load just the last line of the file - last_line = subprocess.check_output(['tail', '-1', file], text=True) - # Found the match, remove the last line and consider this file available - if ending_comment == last_line: - self.removeLastLine(file) + line, pos = self.getLastLine(file) + if ending_comment == line: + with open(file, "r+", encoding="utf-8") as 
f: + f.seek(pos) + f.truncate() return True return False + @staticmethod + def getLastLine(file): + """ + Gets the last line of a text file and the position + in the file at which that last line is. + """ + with open(file, 'rb') as f: + try: + f.seek(-2, os.SEEK_END) + while f.read(1) != b'\n': + f.seek(-2, os.SEEK_CUR) + except OSError: # one line filecd + f.seek(0) + pos = f.tell() + line = f.readline().decode('utf-8') + return line, pos + @staticmethod def removeLastLine(file): """ @@ -191,15 +208,15 @@ def removeLastLine(file): files on the compute host in order to make sure that the entire output file is synced""" # stackoverflow.com/questions/1877999/delete-final-line-in-file-with-python - with open(file, "r+", encoding="utf-8") as file: - file.seek(0, os.SEEK_END) - pos = file.tell() - 1 - while pos > 0 and file.read(1) != "\n": + with open(file, "r+", encoding="utf-8") as f: + f.seek(0, os.SEEK_END) + pos = f.tell() - 1 + while pos > 0 and f.read(1) != "\n": pos -= 1 - file.seek(pos, os.SEEK_SET) + f.seek(pos, os.SEEK_SET) if pos > 0: - file.seek(pos, os.SEEK_SET) - file.truncate() + f.seek(pos, os.SEEK_SET) + f.truncate() @staticmethod def isFileBinary(file): diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index d4ea64bf800f..2ec36c1db2b9 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -195,9 +195,12 @@ def submitJob(self, job): command = command.replace(mpi_command, '') full_command += mpi_command + # Wrap the command with apptainer if we're in a container, and also bind + # in the root directory that the test is contained in APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: - full_command += f'apptainer exec {APPTAINER_CONTAINER} ' + root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] + full_command += f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER} ' full_command += f'"{command}"' num_procs = 
tester.getProcs(options) diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index c6678818fd49..1a3affb94054 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -10,6 +10,7 @@ #PBS -o {{ OUTPUT }} #PBS -l place={{ PLACE }} +# Exit on failure set -e {%- if SOURCE_FILE is defined %} @@ -35,6 +36,7 @@ echo "########################################################################## # Move into the test directory cd {{ CWD }} +# Don't exit on failure: need to capture the actual run's return code set +e # Run the command @@ -42,7 +44,8 @@ set +e # ...and capture the return code cause we're not done yet return_code=$? -set +e +# Exit on failure +set -e # Append a terminator to all of the output files for file syncing across NFS OUTPUT_FILES=({{ OUTPUT_FILES }}) @@ -64,7 +67,7 @@ done # Append a recognizable string at the end of the output. We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output -echo "{{ ENDING_COMMENT }}" +printf "\n{{ ENDING_COMMENT }}" # Exit with the real return code from the job that we ran exit $return_code From 9da7d61c5dfa9cbf4369413205cc581dd28a2f03 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 9 May 2024 22:13:05 -0600 Subject: [PATCH 034/243] Also print out incomplete files --- python/TestHarness/runners/PBSRunner.py | 42 +++++++++++++++---------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index d56d520aa363..3fa45bff0503 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -84,36 +84,47 @@ def wait(self, timer): if tester.mustOutputExist(): for file in tester.getOutputFiles(self.options): wait_files.add(os.path.join(tester.getTestDir(), file)) + # The files that we can read, but are incomplete (no terminator) + 
incomplete_files = set() # Wait for all of the files to be available file_poll_interval = 0.25 waited_time = 0 - while wait_files: + while wait_files or incomplete_files: # Look for each file for file in wait_files.copy(): - if not self.checkFile(file): - continue - # Store the output - if file == output_file: - self.output = open(file, 'r').read() - # Done with this file - wait_files.discard(file) + if os.path.exists(file) and os.path.isfile(file): + wait_files.discard(file) + incomplete_files.add(file) + + # Check for file completeness + for file in incomplete_files.copy(): + if self.fileIsReady(file): + # Store the output + if file == output_file: + self.output = open(file, 'r').read() + # Done with this file + incomplete_files.discard(file) # We've waited for files for too long - if wait_files and waited_time >= self.wait_output_time: + if (wait_files or incomplete_files) and waited_time >= self.wait_output_time: self.job.setStatus(self.job.error, 'FILE TIMEOUT') if not self.output: self.output = '' - self.output += '#' * 80 + '\nUnavailable output file(s)\n' + '#' * 80 + '\n' - for file in wait_files: - self.output += file + '\n' - self.output += '\n' + def print_files(files, type): + if files: + self.output += '#' * 80 + f'\n{type} output file(s)\n' + '#' * 80 + '\n' + for file in files: + self.output += file + '\n' + self.output += '\n' + print_files(wait_files, 'Unavailable') + print_files(incomplete_files, 'Incomplete') break waited_time += file_poll_interval time.sleep(file_poll_interval) - def checkFile(self, file): + def fileIsReady(self, file): """ Checks if a file is ready for reading. 
@@ -123,9 +134,6 @@ def checkFile(self, file): string (to know that we have the full file) - Remove the terminator string """ - if not os.path.exists(file) or not os.path.isfile(file): - return False - # The file terminator check (to we have the up-to-date copy of the file) # is dependent on whether or not the file is a binary is_binary = self.isFileBinary(file) From 9a45a286493cb598efcf557ba0ef8f586b69a561 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 11:33:48 -0600 Subject: [PATCH 035/243] Run 250 jobs instead --- python/TestHarness/schedulers/RunPBS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 2ec36c1db2b9..202e0e4a5308 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -143,8 +143,8 @@ def getJobSlots(self, job): return 1 def availableSlots(self, params): - # Support managing 200 HPC jobs concurrently - return 200, False + # Support managing 250 HPC jobs concurrently + return 250, False def getPBSJobName(self, job): """Gets the name of the PBS job given a tester From fa0ba0d4ff0310cebc17b320e24f1938568169f2 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 11:34:20 -0600 Subject: [PATCH 036/243] Don't run in parallel with PBS --- python/TestHarness/JobDAG.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index be92e8af4df4..ef5123d96809 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -61,7 +61,7 @@ def getDAG(self): def getJobs(self): """ Return a list of available jobs """ - if (self.canParallel() or self.options.pbs) and not self.options.pedantic_checks: + if self.canParallel() and not self.options.pedantic_checks: return self.__job_dag.ind_nodes() return self.getJob() From ff09735d59fae677abf1ddce8eb2c8b8610304f7 Mon Sep 17 00:00:00 2001 From: 
Logan Harbour Date: Mon, 13 May 2024 15:11:17 -0600 Subject: [PATCH 037/243] Disable use_shell with apptainer PBS --- python/TestHarness/testers/Tester.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index a15c64011b78..8fd6624560c8 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -756,6 +756,9 @@ def checkRunnableBase(self, options): or options.sep_files): reasons['working_directory'] = '--sep-files* enabled' + if self.specs['use_shell'] and options.pbs and os.environ.get('APPTAINER_CONTAINER'): + reasons['use_shell'] = 'no use_shell with apptainer PBS' + ##### The below must be performed last to register all above caveats ##### # Remove any matching user supplied caveats from accumulated checkRunnable caveats that # would normally produce a skipped test. From 769cb748964575bbbc2c15005c2e18181d20fb32 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 15:38:47 -0600 Subject: [PATCH 038/243] Add dependency checking for race conditions --- python/TestHarness/JobDAG.py | 40 ++++++++++++++++++++++++++++ python/TestHarness/schedulers/Job.py | 10 ++++++- python/TestHarness/testers/Tester.py | 3 +++ python/contrib/dag/__init__.py | 14 ++++++++++ 4 files changed, 66 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index ef5123d96809..8314eb1d2956 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -11,6 +11,7 @@ from contrib import dag import pyhit import os +import sys class JobDAG(object): """ Class which builds a Job DAG for use by the Scheduler """ @@ -103,6 +104,7 @@ def _checkDAG(self): self._doMakeDependencies() self._doLast() + self._checkOutputCollisions() # If there are race conditions, then there may be more skipped jobs if self._doRaceConditions(): @@ -219,6 +221,44 @@ def _doSkippedDependencies(self): d_job.addCaveats('skipped 
dependency')
             self.__job_dag.delete_edge_if_exists(job, d_job)
 
+    def _checkOutputCollisions(self):
+        """
+        If running in parallel, checks to see if any tests have outputs
+        that would collide when ran in parallel if prereqs are set.
+        """
+        # No need to check if this spec can't run in parallel, because
+        # all tests will be run sequentially, with no more than one at once
+        if not self.canParallel():
+            return
+
+        jobs = list(self.__job_dag.topological_sort())
+        # Sort by ID so we get it in the input file from top down
+        jobs = sorted(jobs, key = lambda job: job.getID())
+
+        # Work down the file, starting with the second input and looking up for
+        # collisions. By doing it in this order, we will error at the first occurrence.
+        # This is nice because if we list all of the collisions it could be a lot of
+        # confusing output
+        for i in range(1, len(jobs)):
+            job = jobs[i]
+            for other_i in reversed(range(i)):
+                other_job = jobs[other_i]
+                tester = job.getTester()
+                other_tester = other_job.getTester()
+                files = set(tester.getOutputFiles(self.options))
+                other_files = set(other_tester.getOutputFiles(self.options))
+                conflicting_files = list(files.intersection(other_files))
+                if conflicting_files \
+                    and not self.__job_dag.is_dependency(other_job, job):
+                    print(f'In {tester.getSpecFile()}:\n')
+                    print('  This test spec is set to run in parallel, but a race condition was found')
+                    print('  that could lead to multiple tests reading/writing from the same file.\n')
+                    print(f'  Tests: {tester.getTestNameShort()}, {other_tester.getTestNameShort()}')
+                    print(f'  File(s): {", ".join(conflicting_files)}\n')
+                    print('  You can resolve this issue by setting the appropriate prerequisites')
+                    print('  between your tests with the "prereq" parameter')
+                    sys.exit(1)
+
     def _doRaceConditions(self):
         """ Check for race condition errors within in the DAG"""
         # Build output_file in relation to job dictionary
diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py
index 
fb1052c2d9ca..94e101da6d32 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import re, os, json, time +import itertools, re, os, json, time from timeit import default_timer as clock from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker @@ -44,7 +44,11 @@ class Job(object): The Job class is a simple container for the tester and its associated output file object, the DAG, the process object, the exit codes, and the start and end times. """ + # Iterator for producing a unique Job ID + id_iter = itertools.count() + def __init__(self, tester, job_dag, options): + self.id = next(self.id_iter) self.options = options self.__tester = tester self.specs = tester.specs @@ -98,6 +102,10 @@ def __init__(self, tester, job_dag, options): # the next time report statuses self.force_report_status = False + def getID(self): + """Returns the unique ID for the job""" + return self.id + def getUpstreams(self): """ Return a list of all the jobs that needed to be completed before this job """ dag = self.getDAG() diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 8fd6624560c8..3b155f68e9e0 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -278,6 +278,9 @@ def getTestDir(self): return os.path.join(self.specs['test_dir'], self.specs['working_directory']) return self.specs['test_dir'] + def getSpecFile(self): + return os.path.join(self.getTestDir(), self.specs['spec_file']) + def getMinReportTime(self): """ return minimum time elapse before reporting a 'long running' status """ return self.specs['min_reported_time'] diff --git a/python/contrib/dag/__init__.py b/python/contrib/dag/__init__.py index ca7a75d1386b..4678a93a14b4 100644 --- a/python/contrib/dag/__init__.py +++ 
b/python/contrib/dag/__init__.py @@ -316,3 +316,17 @@ def delete_edge_if_exists(self, ind_node, dep_node, graph=None): if dep_node not in graph.get(ind_node, []): return graph[ind_node].remove(dep_node) + + # Added by the MOOSE group + def is_dependency(self, ind_node, dep_node, graph=None): + """ Returns whether or not dep_node depends on ind_node """ + if not graph: + graph = self.graph + + deps = graph[ind_node] + if dep_node in deps: + return True + for node in deps: + if self.is_dependency(node, dep_node, graph=graph): + return True + return False From d02f2721098c91003c9850af5331820ada4523a0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 16:07:14 -0600 Subject: [PATCH 039/243] Use correct variable --- python/TestHarness/runners/SubprocessRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index d15ca9140da2..a7c2ae76f087 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -61,7 +61,7 @@ def spawn(self, timer): process_env = os.environ.copy() # Don't clobber state - process_env['OMPI_MCA_orte_tmpdir_base'] = self.getTester().getTempDirectory().name + process_env['OMPI_MCA_orte_tmpdir_base'] = tester.getTempDirectory().name # Allow oversubscription for hosts that don't have a hostfile process_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' From 72d9c52e89c79511672bca1682ed4c50bae9dfa5 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 16:19:50 -0600 Subject: [PATCH 040/243] Add option to skip for HPC tests --- python/TestHarness/testers/Tester.py | 7 +++++++ test/tests/vectorpostprocessors/work_balance/tests | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 3b155f68e9e0..6974f41d257b 100644 --- 
a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -114,6 +114,9 @@ def validParams(): params.addParam("deprecated", False, "When True the test is no longer considered part SQA process and as such does not include the need for a requirement definition.") params.addParam("collections", [], "A means for defining a collection of tests for SQA process.") params.addParam("classification", 'functional', "A means for defining a requirement classification for SQA process.") + + params.addParam('hpc', True, 'Set to false to not run with HPC schedulers (PBS and slurm)') + return params def __del__(self): @@ -762,6 +765,10 @@ def checkRunnableBase(self, options): if self.specs['use_shell'] and options.pbs and os.environ.get('APPTAINER_CONTAINER'): reasons['use_shell'] = 'no use_shell with apptainer PBS' + # Explicitly skip HPC tests + if not self.specs['hpc'] and options.pbs: + reasons['hpc'] = 'hpc=false' + ##### The below must be performed last to register all above caveats ##### # Remove any matching user supplied caveats from accumulated checkRunnable caveats that # would normally produce a skipped test. diff --git a/test/tests/vectorpostprocessors/work_balance/tests b/test/tests/vectorpostprocessors/work_balance/tests index 5f273b419899..3c67f70d7d6a 100644 --- a/test/tests/vectorpostprocessors/work_balance/tests +++ b/test/tests/vectorpostprocessors/work_balance/tests @@ -13,7 +13,7 @@ min_parallel = 2 max_parallel = 2 mesh_mode = replicated - + hpc = False # different MPI partitioning detail = 'on replicated meshes, and' [] @@ -25,6 +25,7 @@ min_parallel = 2 max_parallel = 2 mesh_mode = distributed + hpc = False # different MPI partitioning detail = 'on distributed meshes.' 
[] From e5869d3312766500c60cd1a4b7b414887e1e8fb3 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 16:52:50 -0600 Subject: [PATCH 041/243] Try to get output more --- python/TestHarness/runners/PBSRunner.py | 28 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index 3fa45bff0503..5dc761a977cd 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -59,15 +59,9 @@ def wait(self, timer): # so we have an invalid state for processing in the Tester if self.job.isFinished(): self.exit_code = -1 - self.output = '' # If we have output, we should try to add it - # TODO: shorten output as an option? - if os.path.exists(output_file) and os.path.isfile(output_file): - try: - self.output = open(file, 'r').read() - except: - pass + self.trySetOutput() # Don't bother looking for the rest of the output return @@ -102,15 +96,15 @@ def wait(self, timer): if self.fileIsReady(file): # Store the output if file == output_file: - self.output = open(file, 'r').read() + self.trySetOutput(throw=True) # Done with this file incomplete_files.discard(file) # We've waited for files for too long if (wait_files or incomplete_files) and waited_time >= self.wait_output_time: self.job.setStatus(self.job.error, 'FILE TIMEOUT') - if not self.output: - self.output = '' + if self.output is None: + self.trySetOutput() def print_files(files, type): if files: self.output += '#' * 80 + f'\n{type} output file(s)\n' + '#' * 80 + '\n' @@ -124,6 +118,20 @@ def print_files(files, type): waited_time += file_poll_interval time.sleep(file_poll_interval) + def trySetOutput(self, throw=False): + if self.output is None: + self.output = '' + + # TODO: shorten output as an option? 
+ output_file = self.run_pbs.getPBSJobOutputPath(self.job) + if os.path.exists(output_file) and os.path.isfile(output_file): + try: + self.output = open(output_file, 'r').read() + except: + if throw: + raise + pass + def fileIsReady(self, file): """ Checks if a file is ready for reading. From 34ce169db6104ea4cae8dbfcdf7f120ca7d38cad Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 16:53:09 -0600 Subject: [PATCH 042/243] Don't bother exiting here, just continue --- python/TestHarness/schedulers/pbs_template | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 1a3affb94054..a4b940948185 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -51,8 +51,7 @@ set -e OUTPUT_FILES=({{ OUTPUT_FILES }}) for file in ${OUTPUT_FILES[@]}; do if [ ! -e "$file" ]; then - echo "Failed to find output file $file" - exit 1 + continue fi # No newline for binaries From 28fa313ca8e4583991019a4d163a7b623ff097af Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 16:57:15 -0600 Subject: [PATCH 043/243] Skip these tests due to the inner mpiexec call --- test/tests/samplers/distribute/tests | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/tests/samplers/distribute/tests b/test/tests/samplers/distribute/tests index 591d226c918e..8fc74a0b88af 100644 --- a/test/tests/samplers/distribute/tests +++ b/test/tests/samplers/distribute/tests @@ -14,6 +14,7 @@ python = 3 required_python_packages = 'pandas matplotlib' installation_type = in_tree # see #26480 + hpc = false # see #26480 [] [plot] type = CheckFiles @@ -26,6 +27,7 @@ required_python_packages = 'pandas matplotlib' detail = "demonstrates efficient parallel scaling of memory use." 
installation_type = in_tree # see #26480 + hpc = false # see #26480 [] [] [] From 1dec4326b6b6689f291f81b35f1f1c83320ba904 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 17:48:17 -0600 Subject: [PATCH 044/243] Store and capture the exit codes --- python/TestHarness/runners/PBSRunner.py | 6 ++++++ python/TestHarness/schedulers/pbs_template | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index 5dc761a977cd..59880de70e81 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -127,6 +127,12 @@ def trySetOutput(self, throw=False): if os.path.exists(output_file) and os.path.isfile(output_file): try: self.output = open(output_file, 'r').read() + + # If we can parse the exit code here, do it. Sometimes PBS + # will do screwy stuff with not capturing the actual exit code... + find_exit_code = re.search('Completed TestHarness RunPBS job; exit code = (\d+)', self.output) + if find_exit_code: + self.exit_code = int(find_exit_code.group(1)) except: if throw: raise diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index a4b940948185..407e0cdd8853 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -47,6 +47,8 @@ return_code=$? 
# Exit on failure set -e +echo "################################################################################" + # Append a terminator to all of the output files for file syncing across NFS OUTPUT_FILES=({{ OUTPUT_FILES }}) for file in ${OUTPUT_FILES[@]}; do @@ -63,6 +65,11 @@ for file in ${OUTPUT_FILES[@]}; do fi done +# We will read this output later on to try to capture the return code +# in the event that PBS doesn't get it to us correctly +echo "Completed TestHarness RunPBS job; exit code = $return_code" +echo "################################################################################" + # Append a recognizable string at the end of the output. We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output From 0bbb86bc2eb11b671b35854af4d02f6a7297ec73 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 17:54:18 -0600 Subject: [PATCH 045/243] Skip these tests for HPC --- test/tests/restart/advanced_stateful_material/tests | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/tests/restart/advanced_stateful_material/tests b/test/tests/restart/advanced_stateful_material/tests index 744184d193c5..4722f382eab4 100644 --- a/test/tests/restart/advanced_stateful_material/tests +++ b/test/tests/restart/advanced_stateful_material/tests @@ -19,6 +19,10 @@ expect_err = 'The stateful material properties in RestartStatefulMaterial "test" that are being restarted do not match the stored properties in the same material object from the checkpoint.' 
prereq = checkpoint detail = 'a stateful property for a single material object is removed' + # RunException with some cases will only run in serial, which means if + # "checkpoint" runs in parallel we'll actually get an error with a + # processor mismatch instead of the one that we want + hpc = false [] [add_prop] type = RunException From d77b40731181844971ab1780464fdbfdd9f66317 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 18:08:59 -0600 Subject: [PATCH 046/243] Do better at parsing the mpi command --- python/TestHarness/schedulers/RunPBS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 202e0e4a5308..260a95f011d5 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -167,9 +167,9 @@ def getPBSJobSubmissionPath(self, job): @staticmethod def parseMPICommand(command): - find_mpi = re.search('^(mpiexec -n [0-9]+ )', command) + find_mpi = re.search('^(\s+)?(mpiexec|mpirun)(\s+-(n|np)\s+\d+)?(\s+)?', command) if find_mpi is not None: - return find_mpi.group(1) + return find_mpi.group(0) return None def submitJob(self, job): From 726a21cb2f48145ddf588d4212df3a1be931b47a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 18:10:54 -0600 Subject: [PATCH 047/243] Don't actually skip this --- python/TestHarness/testers/Tester.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 6974f41d257b..39394c7390f0 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -762,9 +762,6 @@ def checkRunnableBase(self, options): or options.sep_files): reasons['working_directory'] = '--sep-files* enabled' - if self.specs['use_shell'] and options.pbs and os.environ.get('APPTAINER_CONTAINER'): - reasons['use_shell'] = 'no use_shell with apptainer PBS' - # Explicitly skip HPC 
tests if not self.specs['hpc'] and options.pbs: reasons['hpc'] = 'hpc=false' From 74a3ffb9e0e74157f4ae7dbdeec79d95e24b8e7b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 19:51:26 -0600 Subject: [PATCH 048/243] Cleanup template, simplify command ran escapes --- python/TestHarness/runners/PBSRunner.py | 2 +- python/TestHarness/schedulers/RunPBS.py | 9 +++++++-- python/TestHarness/schedulers/pbs_template | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index 59880de70e81..0fce17d6737d 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -130,7 +130,7 @@ def trySetOutput(self, throw=False): # If we can parse the exit code here, do it. Sometimes PBS # will do screwy stuff with not capturing the actual exit code... - find_exit_code = re.search('Completed TestHarness RunPBS job; exit code = (\d+)', self.output) + find_exit_code = re.search('Completed TestHarness RunPBS test execution; exit code = (\d+)', self.output) if find_exit_code: self.exit_code = int(find_exit_code.group(1)) except: diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 260a95f011d5..19a0c24d6f03 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -189,11 +189,14 @@ def submitJob(self, job): # Set up the command. 
We have special logic here for when we're using apptainer, # where we need to put the MPI command outside of the apptainer call full_command = '' - command = tester.getCommand(options).replace('"', "'") + command = tester.getCommand(options) mpi_command = self.parseMPICommand(command) if mpi_command: command = command.replace(mpi_command, '') full_command += mpi_command + # Split out whitespace in the command and then use json dumps to + # escape quoted characters + command = json.dumps(command.replace('\n', ' ')) # Wrap the command with apptainer if we're in a container, and also bind # in the root directory that the test is contained in @@ -201,7 +204,8 @@ def submitJob(self, job): if APPTAINER_CONTAINER: root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] full_command += f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER} ' - full_command += f'"{command}"' + + full_command += command num_procs = tester.getProcs(options) num_threads = tester.getThreads(options) @@ -225,6 +229,7 @@ def submitJob(self, job): 'PROJECT': self.options.queue_project, 'OUTPUT': output_file, 'PLACE': 'scatter', + 'TEST_NAME': tester.getTestName(), 'SUBMITTED_HOSTNAME': socket.gethostname(), 'CWD': tester.getTestDir(), 'COMMAND': full_command, diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 407e0cdd8853..2b2d654aac14 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -22,16 +22,17 @@ set -e export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} # Print a useful header -echo "################################################################################" -echo "Beginning TestHarness RunPBS job" -echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" -echo "Hostname: $(hostname)" echo "Time: $(date)" +echo "Test: {{ TEST_NAME }}" echo "Directory: {{ CWD }}" -echo "Output: {{ OUTPUT }}" echo "Command: {{ COMMAND }}" +echo "Hostname: $(hostname)" +echo 
"Submitted hostname: {{ SUBMITTED_HOSTNAME }}" +echo "Output: {{ OUTPUT }}" module list echo "################################################################################" +echo "Beginning TestHarness RunPBS test execution" +echo "################################################################################" # Move into the test directory cd {{ CWD }} @@ -47,6 +48,10 @@ return_code=$? # Exit on failure set -e +# We will read this output later on to try to capture the return code +# in the event that PBS doesn't get it to us correctly +echo "################################################################################" +echo "Completed TestHarness RunPBS test execution; exit code = $return_code" echo "################################################################################" # Append a terminator to all of the output files for file syncing across NFS @@ -65,11 +70,6 @@ for file in ${OUTPUT_FILES[@]}; do fi done -# We will read this output later on to try to capture the return code -# in the event that PBS doesn't get it to us correctly -echo "Completed TestHarness RunPBS job; exit code = $return_code" -echo "################################################################################" - # Append a recognizable string at the end of the output. 
We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output From b9089e30d0500f2b5a997fbd30ada48d0f8193f6 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 13 May 2024 20:13:32 -0600 Subject: [PATCH 049/243] Remove the command because we can't escape it --- python/TestHarness/schedulers/pbs_template | 1 - 1 file changed, 1 deletion(-) diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 2b2d654aac14..a1bf6612aa36 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -25,7 +25,6 @@ export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} echo "Time: $(date)" echo "Test: {{ TEST_NAME }}" echo "Directory: {{ CWD }}" -echo "Command: {{ COMMAND }}" echo "Hostname: $(hostname)" echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" echo "Output: {{ OUTPUT }}" From 18b263ff77527dbbeb43a5f9f957162827612e69 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 08:27:10 -0600 Subject: [PATCH 050/243] Only use mpiexec when we need to run in parallel --- python/mooseutils/mooseutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mooseutils/mooseutils.py b/python/mooseutils/mooseutils.py index 42539e23646c..616fe4129c29 100644 --- a/python/mooseutils/mooseutils.py +++ b/python/mooseutils/mooseutils.py @@ -165,7 +165,7 @@ def run_executable(app_path, *args, mpi=None, suppress_output=False): A function for running an application. 
""" import subprocess - if mpi and isinstance(mpi, int): + if mpi and isinstance(mpi, int) and mpi > 1: cmd = ['mpiexec', '-n', str(mpi), app_path] else: cmd = [app_path] From 39b1073ef3419117b6effec63f89ea3c041760c8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 08:27:23 -0600 Subject: [PATCH 051/243] Limit parallel python unit tests in apptainer hpc --- python/TestHarness/testers/PythonUnitTest.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 645c3bda8432..7bb888ac2f44 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -57,3 +57,15 @@ def checkRunnable(self, options): return False return super().checkRunnable(options) + + def getProcs(self, options): + procs = super().getProcs(options) + # If we start within a script within apptainer and then call mpiexec on HPC, + # it will not work because the mpiexec call needs to be outside of the apptainer + # call. 
So, limit these tests to 1 proc + if options.pbs and \ + os.environ.get('APPTAINER_CONTAINER') and \ + int(self.specs['min_parallel']) == 1 and procs != 1: + self.addCaveats('hpc apptainer max_cpus=1') + return 1 + return procs From baba7e19b55f7391bfa79d60b3cf8ab92f35f3de Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 08:37:01 -0600 Subject: [PATCH 052/243] Support SSH disconnections --- python/TestHarness/schedulers/RunPBS.py | 52 +++++++++++++++++-------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 19a0c24d6f03..6d96c4fcf3b1 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -70,24 +70,10 @@ def __init__(self, harness, params): # Setup the remote PBS host, if any (needed when submitted in a container) self.pbs_ssh = None # The lock for calling PBS commands via SSH, if any - self.pbs_ssh_lock = None + self.pbs_ssh_lock = threading.Lock() # Setup the jump host if provided if self.pbs_ssh_host: - # Try to find a key to use - key_filename = None - try: - ssh_config = os.path.expanduser('~/.ssh/config') - config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.pbs_ssh_host) - identityfile = config.get('identityfile') - if identityfile is not None and len(identityfile) > 0: - key_filename = identityfile[-1] - except: - pass - - self.pbs_ssh_lock = threading.Lock() - self.pbs_ssh = paramiko.SSHClient() - self.pbs_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.pbs_ssh.connect(self.pbs_ssh_host, key_filename=key_filename) + self._connectSSH() # Load the PBS template template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template') @@ -110,6 +96,31 @@ def __init__(self, harness, params): if self.options.queue_source_command: self.source_contents = open(self.options.queue_source_command, 'r').read() + def _connectSSH(self): + """ + Connects to the PBS 
SSH host. + + This is separate so that if the connection is dropped we can attempt + to connect to it again. + """ + if not self.pbs_ssh_host: + raise Exception('PBS SSH host not configured') + + # Try to find a key to use + key_filename = None + try: + ssh_config = os.path.expanduser('~/.ssh/config') + config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.pbs_ssh_host) + identityfile = config.get('identityfile') + if identityfile is not None and len(identityfile) > 0: + key_filename = identityfile[-1] + except: + pass + + self.pbs_ssh = paramiko.SSHClient() + self.pbs_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.pbs_ssh.connect(self.pbs_ssh_host, key_filename=key_filename) + class CallPBSException(Exception): """Exception class for providing extra context for PBS submission errors""" def __init__(self, run_pbs, description, command, result=None): @@ -129,7 +140,14 @@ def callPBS(self, command): with self.pbs_ssh_lock: try: - _, stdout, stderr = self.pbs_ssh.exec_command(command) + # This inner try is for if the SSH connection has died. 
+ try: + _, stdout, stderr = self.pbs_ssh.exec_command(command) + # Try to reconnect and run again + except paramiko.ssh_exception.SSHException: + self._connectSSH() + _, stdout, stderr = self.pbs_ssh.exec_command(command) + exit_code = stdout.channel.recv_exit_status() result = ''.join(stdout.readlines()) if exit_code != 0: From af4b287c851a0b0136a3d5191837157de10c39f6 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 10:19:18 -0600 Subject: [PATCH 053/243] Simplify the lock management --- python/TestHarness/JobDAG.py | 6 ++ python/TestHarness/schedulers/Job.py | 7 +- python/TestHarness/schedulers/Scheduler.py | 106 +++++++++++---------- 3 files changed, 70 insertions(+), 49 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 8314eb1d2956..027f70defe58 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -12,12 +12,14 @@ import pyhit import os import sys +import threading class JobDAG(object): """ Class which builds a Job DAG for use by the Scheduler """ def __init__(self, options): self.__job_dag = dag.DAG() self.__parallel_scheduling = None + self.__j_lock = threading.Lock() self.options = options def _setParallel(self): @@ -36,6 +38,10 @@ def _setParallel(self): return self.__parallel_scheduling + def getLock(self): + """ Return the lock for this test spec (folder of jobs) """ + return self.__j_lock + def canParallel(self): """ Return bool whether or not this group runs in parallel """ return self._setParallel() diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 94e101da6d32..c14843a745ff 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import itertools, re, os, json, time +import itertools, re, os, json, time, threading from timeit import default_timer as clock from 
TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker @@ -50,6 +50,7 @@ class Job(object): def __init__(self, tester, job_dag, options): self.id = next(self.id_iter) self.options = options + self.__j_lock = threading.Lock() self.__tester = tester self.specs = tester.specs self.__job_dag = job_dag @@ -106,6 +107,10 @@ def getID(self): """Returns the unique ID for the job""" return self.id + def getLock(self): + """ Get the lock associated with this job """ + return self.__j_lock + def getUpstreams(self): """ Return a list of all the jobs that needed to be completed before this job """ dag = self.getDAG() diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 0bf9ca5b167a..60d93921004b 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -89,6 +89,9 @@ def __init__(self, harness, params): # List of lists containing all job objects entering the run_pool self.__dag_bank = [] + # Lock for __job_bank and __dag_bank + self.__bank_lock = threading.Lock() + # Total running Job and Test failures encountered self.__failures = 0 @@ -114,9 +117,6 @@ def __init__(self, harness, params): # Whether or not to enforce the timeout of jobs self.enforce_timeout = True - # The job lock - self.j_lock = None - def availableSlots(self, params): """ Get the number of available slots for processing jobs and @@ -188,16 +188,17 @@ def __sortAndLaunch(self): Sort by largest DAG and launch """ sorted_jobs = sorted(self.__dag_bank, key=lambda x: len(x[1].topological_sort()), reverse=True) - for (jobs, j_dag, j_lock) in sorted_jobs: - self.queueJobs(jobs, j_lock) + for jobs, _ in sorted_jobs: + self.queueJobs(jobs) def setAndOutputJobStatus(self, job, status): """ Sets a Job's status and forces the status to be output asap """ - job.setStatus(status) - job.force_report_status = True - self.status_pool.apply_async(self.jobStatus, (job, None, self.j_lock)) 
+ with job.getLock(): + job.setStatus(status) + job.force_report_status = True + self.handleJobStatus(job) def waitFinish(self): """ @@ -207,9 +208,10 @@ def waitFinish(self): self.__waiting = True try: # wait until there is an error, or job_bank has emptied - while self.__job_bank: - if self.__error_state: - break + while True: + with self.__bank_lock: + if not self.__job_bank: + break sleep(0.1) # Completed all jobs sanity check @@ -239,6 +241,8 @@ def getStatusPoolState(self): def schedule(self, testers): """ Generate and submit a group of testers to a thread pool queue for execution. + + This process is serial. """ # If we are not to schedule any more jobs for some reason, return now if self.__error_state: @@ -247,7 +251,6 @@ def schedule(self, testers): # Instance our job DAG, create jobs, and a private lock for this group of jobs (testers) jobs = JobDAG(self.options) j_dag = jobs.createJobs(testers) - self.j_lock = threading.Lock() # Allow derived schedulers access to the jobs before they launch self.augmentJobs(jobs) @@ -256,16 +259,16 @@ def schedule(self, testers): if j_dag.size() != len(testers): raise SchedulerError('Scheduler was going to run a different amount of testers than what was received (something bad happened)!') - with self.j_lock: - # As testers (jobs) finish, they are removed from job_bank - self.__job_bank.update(j_dag.topological_sort()) - # List of objects relating to eachother (used for thread locking this job group) - self.__dag_bank.append([jobs, j_dag, self.j_lock]) + # Don't need to lock below because this process is serial + # As testers (jobs) finish, they are removed from job_bank + self.__job_bank.update(j_dag.topological_sort()) + # List of objects relating to eachother (used for thread locking this job group) + self.__dag_bank.append([jobs, j_dag]) # Store all scheduled jobs self.__scheduled_jobs.append(j_dag.topological_sort()) - def queueJobs(self, jobs, j_lock): + def queueJobs(self, jobs): """ Determine which queue jobs 
should enter. Finished jobs are placed in the status pool to be printed while all others are placed in the runner pool to perform work. @@ -273,19 +276,18 @@ def queueJobs(self, jobs, j_lock): A finished job will trigger a change to the Job DAG, which will allow additional jobs to become available and ready to enter the runner pool (dependency jobs). """ - state = self.getStatusPoolState() - with j_lock: + with jobs.getLock(): concurrent_jobs = jobs.getJobsAndAdvance() for job in concurrent_jobs: if job.isFinished(): if not state: - self.status_pool.apply_async(self.jobStatus, (job, jobs, j_lock)) + self.handleJobStatus(job) elif job.isHold(): if not state: job.setStatus(job.queued) - self.run_pool.apply_async(self.runJob, (job, jobs, j_lock)) + self.run_pool.apply_async(self.runJob, (job, jobs,)) def getLoad(self): """ Method to return current load average """ @@ -310,7 +312,7 @@ def getJobSlots(self, job): ran externally.""" return job.getSlots() - def reserveSlots(self, job, j_lock): + def reserveSlots(self, job): """ Method which allocates resources to perform the job. Returns bool if job should be allowed to run based on available resources. @@ -334,25 +336,29 @@ def reserveSlots(self, job, j_lock): # Check for insufficient slots -hard limit (skip this job) elif job_slots > self.available_slots and not self.soft_limit: job.addCaveats('insufficient slots') - with j_lock: + with job.getLock(): job.setStatus(job.skip) if can_run: self.slots_in_use += job_slots return can_run - def handleTimeoutJob(self, job, j_lock): + def handleTimeoutJob(self, job): """ Handle jobs that have timed out """ - with j_lock: + with job.getLock(): if job.isRunning(): job.setStatus(job.timeout, 'TIMEOUT') job.killProcess() - def handleLongRunningJob(self, job, jobs, j_lock): - """ Handle jobs that have not reported in the alotted time """ - self.status_pool.apply_async(self.jobStatus, (job, jobs, j_lock)) + def handleJobStatus(self, job): + """ + Possibly reports a job's status. 
+ + Whether or not it actually gets reported... is not so intuitive. + """ + self.status_pool.apply_async(self.jobStatus, (job,)) - def jobStatus(self, job, jobs, j_lock): + def jobStatus(self, job): """ Instruct the TestHarness to print the status of job. This is a serial threaded operation, so as to prevent clobbering of text being printed @@ -367,12 +373,15 @@ def jobStatus(self, job, jobs, j_lock): # completion as a sanity check). state = self.getStatusPoolState() - if state or job not in self.__job_bank: + if state: return + with self.__bank_lock: + if job not in self.__job_bank: + return # Peform within a try, to allow keyboard ctrl-c try: - with j_lock: + with job.getLock(): # This job is set to force a status force_status = job.force_report_status @@ -402,8 +411,8 @@ def jobStatus(self, job, jobs, j_lock): # adjust the next report time based on delta of last report time adjusted_interval = max(1, self.min_report_time - max(1, clock() - self.last_reported_time)) job.report_timer = threading.Timer(adjusted_interval, - self.handleLongRunningJob, - (job, jobs, j_lock,)) + self.handleJobStatus, + (job,)) job.report_timer.start() return @@ -418,10 +427,11 @@ def jobStatus(self, job, jobs, j_lock): self.__failures += 1 if job.isFinished(): - if job in self.__job_bank: - self.__job_bank.remove(job) - else: - raise SchedulerError('job accountability failure while working with: %s' % (job.getTestName())) + with self.__bank_lock: + if job in self.__job_bank: + self.__job_bank.remove(job) + else: + raise SchedulerError('job accountability failure while working with: %s' % (job.getTestName())) # Max failure threshold reached, begin shutdown if self.maxFailures(): @@ -434,7 +444,7 @@ def jobStatus(self, job, jobs, j_lock): except KeyboardInterrupt: self.killRemaining(keyboard=True) - def runJob(self, job, jobs, j_lock): + def runJob(self, job, jobs): """ Method the run_pool calls when an available thread becomes ready """ # Its possible, the queue is just trying to 
empty. Allow it to do so # with out generating overhead @@ -443,8 +453,8 @@ def runJob(self, job, jobs, j_lock): try: # see if we have enough slots to start this job - if self.reserveSlots(job, j_lock): - with j_lock: + if self.reserveSlots(job): + with job.getLock(): job.setStatus(job.running) with self.activity_lock: @@ -453,15 +463,15 @@ def runJob(self, job, jobs, j_lock): if self.enforce_timeout: timeout_timer = threading.Timer(float(job.getMaxTime()), self.handleTimeoutJob, - (job, j_lock,)) + (job,)) timeout_timer.start() else: timeout_timer = None if self.report_long_jobs: job.report_timer = threading.Timer(self.min_report_time, - self.handleLongRunningJob, - (job, jobs, j_lock,)) + self.handleJobStatus, + (job,)) job.report_timer.start() else: job.report_timer = None @@ -472,7 +482,7 @@ def runJob(self, job, jobs, j_lock): try: self.run(job) # Hand execution over to derived scheduler except Exception: - with j_lock: + with job.getLock(): job.setStatus(StatusSystem().error, 'JOB EXCEPTION') job.setOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) @@ -488,7 +498,7 @@ def runJob(self, job, jobs, j_lock): job.report_timer.cancel() # All done - with j_lock: + with job.getLock(): job.setStatus(StatusSystem().finished) with self.activity_lock: @@ -498,12 +508,12 @@ def runJob(self, job, jobs, j_lock): else: # ...currently, place back on hold before placing it back into the queue if not job.isFinished(): - with j_lock: + with job.getLock(): job.setStatus(job.hold) sleep(.1) # Job is done (or needs to re-enter the queue) - self.queueJobs(jobs, j_lock) + self.queueJobs(jobs) except Exception: print('runWorker Exception: %s' % (traceback.format_exc())) From 34ba0cd674237230778467fe89decedeb4a43cb8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 10:51:38 -0600 Subject: [PATCH 054/243] Add in a few more locks --- python/TestHarness/schedulers/RunPBS.py | 3 ++- python/TestHarness/schedulers/Scheduler.py | 27 
+++++++++++----------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 6d96c4fcf3b1..5e5591406547 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -299,7 +299,8 @@ def submitJob(self, job): raise self.CallPBSException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) # Job has been submitted, so set it as queued - job.addCaveats(job_id) + with job.lock(): + job.addCaveats(job_id) self.setAndOutputJobStatus(job, job.queued) # Setup the job in the status map diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 60d93921004b..e0a829bc71ae 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -457,6 +457,15 @@ def runJob(self, job, jobs): with job.getLock(): job.setStatus(job.running) + # Setup the long running timer, if any + if self.report_long_jobs: + job.report_timer = threading.Timer(self.min_report_time, + self.handleJobStatus, + (job,)) + job.report_timer.start() + else: + job.report_timer = None + with self.activity_lock: self.__active_jobs.add(job) @@ -468,14 +477,6 @@ def runJob(self, job, jobs): else: timeout_timer = None - if self.report_long_jobs: - job.report_timer = threading.Timer(self.min_report_time, - self.handleJobStatus, - (job,)) - job.report_timer.start() - else: - job.report_timer = None - # We have a try here because we want to explicitly catch things like # python errors in _only_ the Job; exceptions that happen in the Tester # from within the Job will get caught within the Tester @@ -493,12 +494,12 @@ def runJob(self, job, jobs): with self.slot_lock: self.slots_in_use = max(0, self.slots_in_use - self.getJobSlots(job)) - # Stop the long running timer - if job.report_timer: - job.report_timer.cancel() - - # All done with job.getLock(): + # Stop the long running timer + if 
job.report_timer: + job.report_timer.cancel() + + # All done job.setStatus(StatusSystem().finished) with self.activity_lock: From a588f60cad669c62874f5e1964f914d824da0207 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 12:07:49 -0600 Subject: [PATCH 055/243] Correct callback --- python/TestHarness/TestHarness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 48def6b17052..5ad5c68fd7b3 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -749,7 +749,7 @@ def cleanup(self): # The TestHarness receives individual jobs out of order (can't realistically use self.test_table) tester_dirs = {} dag_table = [] - for jobs, dag, thread_lock in self.scheduler.retrieveDAGs(): + for jobs, dag in self.scheduler.retrieveDAGs(): original_dag = dag.getOriginalDAG() total_time = float(0.0) tester = None From 6f45da9584eff18d09c1a31d3da699b94ae32044 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 13:57:16 -0600 Subject: [PATCH 056/243] Read truncated files instead --- python/TestHarness/runners/PBSRunner.py | 81 +++++++++++++++++++------ python/TestHarness/schedulers/RunPBS.py | 3 +- 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/PBSRunner.py index 0fce17d6737d..b94233f9608f 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/PBSRunner.py @@ -126,7 +126,7 @@ def trySetOutput(self, throw=False): output_file = self.run_pbs.getPBSJobOutputPath(self.job) if os.path.exists(output_file) and os.path.isfile(output_file): try: - self.output = open(output_file, 'r').read() + self.output = self.readTruncated(output_file) # If we can parse the exit code here, do it. Sometimes PBS # will do screwy stuff with not capturing the actual exit code... 
@@ -204,6 +204,66 @@ def fileIsReady(self, file): return False + @staticmethod + def readTruncated(file, start_lines=1000, end_lines=1000): + """ + Reads a file and truncates it past a certain amount of lines. + """ + with open(file, 'rb') as f: + # Find the end position of the file so that we don't read past + f.seek(0, os.SEEK_END) + total_bytes = f.tell() + + # Read the set of lines + f.seek(0) + head_lines_read = 0 + head = '' + while head_lines_read < start_lines and f.tell() < total_bytes: + head += f.read(1).decode('utf-8') + if len(head) > 1 and head[-1:] == '\n': + head_lines_read += 1 + + # Keep the end of the head position so that we don't read + # backwards past it for the tail + head_pos = f.tell() + + # Seek to the end and start reading ending lines + f.seek(0, os.SEEK_END) + + # Keep reading the ending lines until we've reached the max + # number of lines we want or have reached the head output + tail_lines_read = 0 + tail = [] + while tail_lines_read < end_lines and f.tell() > head_pos: + # Read each character in the line until we reach + # the beginning or a new line + line = [] + while f.tell() > 1: + f.seek(-2, os.SEEK_CUR) + char = f.read(1).decode('utf-8') + if char == '\n' or f.tell() == 0: + break + line.append(char) + + # Append the new read line + line.reverse() + tail.append(''.join(line)) + tail_lines_read += 1 + + # Whether or not we have truncated output + # (have hit the location of the head output) + truncated = f.tell() != head_pos + + # Form the combined output + output = head + if truncated: + output += f'{"#" * 80}\nOUTPUT TRIMMED\n{"#" * 80}\n' + if tail: + tail.reverse() + output += '\n'.join(tail) + + return output + @staticmethod def getLastLine(file): """ @@ -221,25 +281,6 @@ def getLastLine(file): line = f.readline().decode('utf-8') return line, pos - @staticmethod - def removeLastLine(file): - """ - Removes the last line from the given text file. 
- - Used to remove the terminator that we append to all output - files on the compute host in order to make sure that the - entire output file is synced""" - # stackoverflow.com/questions/1877999/delete-final-line-in-file-with-python - with open(file, "r+", encoding="utf-8") as f: - f.seek(0, os.SEEK_END) - pos = f.tell() - 1 - while pos > 0 and f.read(1) != "\n": - pos -= 1 - f.seek(pos, os.SEEK_SET) - if pos > 0: - f.seek(pos, os.SEEK_SET) - f.truncate() - @staticmethod def isFileBinary(file): """ diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 5e5591406547..6d96c4fcf3b1 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -299,8 +299,7 @@ def submitJob(self, job): raise self.CallPBSException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) # Job has been submitted, so set it as queued - with job.lock(): - job.addCaveats(job_id) + job.addCaveats(job_id) self.setAndOutputJobStatus(job, job.queued) # Setup the job in the status map From d5c28676a2ae3f09d7fdcbd8e38a0c3c316f58d7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 14:05:47 -0600 Subject: [PATCH 057/243] Print out the escaped command in the header --- python/TestHarness/schedulers/RunPBS.py | 4 ++++ python/TestHarness/schedulers/pbs_template | 1 + 2 files changed, 5 insertions(+) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 6d96c4fcf3b1..0471c9c52693 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -223,7 +223,10 @@ def submitJob(self, job): root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] full_command += f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER} ' + # Build the full command full_command += command + # Build an escaped command so that we can print it out in the header + escaped_command = json.dumps(full_command) num_procs = 
tester.getProcs(options) num_threads = tester.getThreads(options) @@ -251,6 +254,7 @@ def submitJob(self, job): 'SUBMITTED_HOSTNAME': socket.gethostname(), 'CWD': tester.getTestDir(), 'COMMAND': full_command, + 'ESCAPED_COMMAND': escaped_command, 'ENDING_COMMENT': self.getOutputEndingComment(), 'MOOSE_PYTHONPATH': moose_python, 'OUTPUT_FILES': output_files} diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index a1bf6612aa36..890d397c5010 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -25,6 +25,7 @@ export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} echo "Time: $(date)" echo "Test: {{ TEST_NAME }}" echo "Directory: {{ CWD }}" +echo "Command: {{ ESCAPED_COMMAND }}" echo "Hostname: $(hostname)" echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" echo "Output: {{ OUTPUT }}" From 8cc830b751f330ae2fbb5285cf5bff6fbe29a1e5 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 16:03:24 -0600 Subject: [PATCH 058/243] Separate out PBS components --- python/TestHarness/TestHarness.py | 4 +- .../runners/{PBSRunner.py => HPCRunner.py} | 122 +++--- python/TestHarness/schedulers/RunHPC.py | 319 ++++++++++++++ python/TestHarness/schedulers/RunPBS.py | 409 ++++-------------- .../{pbs_source_apptainer => hpc_source} | 0 python/TestHarness/schedulers/pbs_template | 4 +- python/TestHarness/testers/RunApp.py | 8 + python/TestHarness/testers/Tester.py | 10 + 8 files changed, 488 insertions(+), 388 deletions(-) rename python/TestHarness/runners/{PBSRunner.py => HPCRunner.py} (89%) create mode 100644 python/TestHarness/schedulers/RunHPC.py rename python/TestHarness/schedulers/{pbs_source_apptainer => hpc_source} (100%) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 5ad5c68fd7b3..3eca6d717041 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1062,10 +1062,10 @@ def 
parseCLArgs(self, argv): queuegroup = parser.add_argument_group('Queue Options', 'Options controlling which queue manager to use') queuegroup.add_argument('--pbs', action='store_true', dest='pbs', help='Launch tests using PBS as your scheduler') - queuegroup.add_argument('--pbs-pre-source', nargs=1, action="store", dest='queue_source_command', metavar='', help='Source specified file before launching tests') queuegroup.add_argument('--pbs-project', nargs=1, action='store', dest='queue_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') queuegroup.add_argument('--pbs-queue', nargs=1, action='store', dest='queue_queue', type=str, metavar='', help='Submit jobs to the specified queue') - queuegroup.add_argument('--pbs-host', nargs=1, action='store', dest='queue_host', metavar='', help='The PBS host to use for submitting jobs') + queuegroup.add_argument('--hpc-host', nargs=1, action='store', dest='queue_host', metavar='', help='The host to use for submitting HPC jobs') + queuegroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='queue_source_command', metavar='', help='Source specified file before launching HPC tests') code = True if self.code.decode() in argv: diff --git a/python/TestHarness/runners/PBSRunner.py b/python/TestHarness/runners/HPCRunner.py similarity index 89% rename from python/TestHarness/runners/PBSRunner.py rename to python/TestHarness/runners/HPCRunner.py index b94233f9608f..da056cdac130 100644 --- a/python/TestHarness/runners/PBSRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -10,14 +10,15 @@ from TestHarness.runners.Runner import Runner import re, time, os, subprocess -class PBSRunner(Runner): - """Runner that spawns a process with PBS. - - To be used with the RunPBS scheduler. 
+class HPCRunner(Runner): + """ + Base Runner to be used with HPC schedulers (PBS, slurm) """ - def __init__(self, job, options, run_pbs): - Runner.__init__(self, job, options) - self.run_pbs = run_pbs + def __init__(self, job, options, run_hpc): + super().__init__(job, options) + + # The RunHPC object + self.run_hpc = run_hpc # Number of seconds to try to wait for the output # We don't want to wait forever for output because @@ -27,33 +28,29 @@ def __init__(self, job, options, run_pbs): self.wait_output_time = 120 def spawn(self, timer): - from TestHarness.schedulers.RunPBS import RunPBS - - # Submit the job - self.run_pbs.submitJob(self.job) - + self.run_hpc.submitJob(self.job) timer.start() def wait(self, timer): # Need to import here to avoid cyclic includes - from TestHarness.schedulers.RunPBS import RunPBS + from TestHarness.schedulers.RunHPC import RunHPC # Poll loop waiting for the job to be finished # This gets a structure that represents the job, and the - # polling itself is only done on occasion within RunPBS + # polling itself is only done on occasion within RunHPC while True: time.sleep(1) - pbs_job = self.run_pbs.getPBSJob(self.job) + hpc_job = self.run_hpc.getHPCJob(self.job) # We're done - if pbs_job.done: - self.exit_code = pbs_job.exit_code + if hpc_job.done: + self.exit_code = hpc_job.exit_code break timer.stop() # The PBS output (stdout+stderr) - output_file = self.run_pbs.getPBSJobOutputPath(self.job) + output_file = self.run_hpc.getHPCJobOutputPath(self.job) # If the Job is already finished, something happened in PBS # so we have an invalid state for processing in the Tester @@ -70,7 +67,7 @@ def wait(self, timer): # We've actually ran something now and not just qsub, so update the # command to what was ran there - tester.setCommandRan(pbs_job.command) + tester.setCommandRan(hpc_job.command) # Determine the output files that we need to wait for to be complete wait_files = set([output_file]) @@ -118,19 +115,25 @@ def print_files(files, type): 
waited_time += file_poll_interval time.sleep(file_poll_interval) + def kill(self): + self.run_hpc.killJob(self.job) + def trySetOutput(self, throw=False): if self.output is None: self.output = '' - # TODO: shorten output as an option? - output_file = self.run_pbs.getPBSJobOutputPath(self.job) + output_file = self.run_hpc.getHPCJobOutputPath(self.job) if os.path.exists(output_file) and os.path.isfile(output_file): try: - self.output = self.readTruncated(output_file) + # If we're trying to read output, we can't truncate it + if self.job.getTester().needFullOutput(self.options): + self.output = open(output_file, 'r').read() + else: + self.output = self.readTruncated(output_file) # If we can parse the exit code here, do it. Sometimes PBS # will do screwy stuff with not capturing the actual exit code... - find_exit_code = re.search('Completed TestHarness RunPBS test execution; exit code = (\d+)', self.output) + find_exit_code = re.search('Completed TestHarness RunHPC test execution; exit code = (\d+)', self.output) if find_exit_code: self.exit_code = int(find_exit_code.group(1)) except: @@ -157,7 +160,7 @@ def fileIsReady(self, file): if is_binary is None: return False - ending_comment = self.run_pbs.getOutputEndingComment() + ending_comment = self.run_hpc.getOutputEndingComment() # Binary file if is_binary: @@ -204,6 +207,40 @@ def fileIsReady(self, file): return False + @staticmethod + def isFileBinary(file): + """ + Returns whether or not the given file is a binary file. + + If None, a failure was encountered when checking the file type. + """ + try: + call_file = subprocess.check_output(['file', '--mime-encoding', file], text=True) + except: + return None + + # Will return something like ": ", + # where =binary when the file is binary + find_binary = re.search('binary$', call_file) + return find_binary is not None + + @staticmethod + def getLastLine(file): + """ + Gets the last line of a text file and the position + in the file at which that last line is. 
+ """ + with open(file, 'rb') as f: + try: + f.seek(-2, os.SEEK_END) + while f.read(1) != b'\n': + f.seek(-2, os.SEEK_CUR) + except OSError: # one line filecd + f.seek(0) + pos = f.tell() + line = f.readline().decode('utf-8') + return line, pos + @staticmethod def readTruncated(file, start_lines=1000, end_lines=1000): """ @@ -263,40 +300,3 @@ def readTruncated(file, start_lines=1000, end_lines=1000): output += '\n'.join(tail) return output - - @staticmethod - def getLastLine(file): - """ - Gets the last line of a text file and the position - in the file at which that last line is. - """ - with open(file, 'rb') as f: - try: - f.seek(-2, os.SEEK_END) - while f.read(1) != b'\n': - f.seek(-2, os.SEEK_CUR) - except OSError: # one line filecd - f.seek(0) - pos = f.tell() - line = f.readline().decode('utf-8') - return line, pos - - @staticmethod - def isFileBinary(file): - """ - Returns whether or not the given file is a binary file. - - If None, a failure was encountered when checking the file type. 
- """ - try: - call_file = subprocess.check_output(['file', '--mime-encoding', file], text=True) - except: - return None - - # Will return something like ": ", - # where =binary when the file is binary - find_binary = re.search('binary$', call_file) - return find_binary is not None - - def kill(self): - self.run_pbs.killJob(self.job) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py new file mode 100644 index 000000000000..8adb04880450 --- /dev/null +++ b/python/TestHarness/schedulers/RunHPC.py @@ -0,0 +1,319 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +from RunParallel import RunParallel +import threading, os, re, sys, datetime, json +import paramiko +from timeit import default_timer as clock + +class RunHPC(RunParallel): + """ + Base scheduler for jobs that are ran on HPC. 
+ """ + def __init__(self, harness, params): + super().__init__(harness, params) + + self.params = params + self.options = harness.getOptions() + + # We don't want to report long running jobs here because we will + # manually set jobs as RUNNING as we notice their HPC status change + self.report_long_jobs = False + # We don't want to enforce the timeout here because we don't want to + # check it while the jobs are queued and HPC itself will handle the + # timeout because the job itself will be forcefully killed by HPC + self.enforce_timeout = False + + # Lock for accessing self.hpc_jobs + self.hpc_jobs_lock = threading.Lock() + # The last time statues were updated in getHPCJob() (if any) + self.hpc_jobs_status_timer = None + # How often to poll for status updates in getHPCJob() + self.hpc_jobs_update_interval = 10 + # Map of Job -> HPCJob + self.hpc_jobs = {} + + # The jump hostname for running commands, if any + self.ssh_host = self.options.queue_host + # Setup the remote HPC host, if any (needed when submitted in a container) + self.ssh = None + # The lock for calling commands via SSH, if any + self.ssh_lock = threading.Lock() + # Setup the jump host if provided + if self.ssh_host: + self._connectSSH() + + if os.environ.get('APPTAINER_CONTAINER'): + if not self.ssh_host: + print('ERROR: --hpc-host must be set when using HPC jobs within apptainer') + sys.exit(1) + if not self.options.queue_source_command: + default_pre_source = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_source') + self.options.queue_source_command = default_pre_source + print(f'INFO: Setting --hpc-pre-source={default_pre_source}') + + if self.options.queue_source_command and not os.path.exists(self.options.queue_source_command): + print(f'ERROR: --hpc-pre-source path {self.options.queue_source_command} does not exist') + sys.exit(1) + + # Load the pre-source if it exists + self.source_contents = None + if self.options.queue_source_command: + self.source_contents = 
open(self.options.queue_source_command, 'r').read() + + class HPCJob: + """ + Structure that represents the cached information about an HPC job + """ + def __init__(self, id, command): + # The job identifier + self.id = id + # Whether or not this job is done; here done doesn't mean if it + # was successful or not, just if it is not running/queued anymore + self.done = False + # The exit code of the command that was ran (if any) + self.exit_code = None + # The command that was ran within the submission script + self.command = command + # Whether or not this job was killed; used so what we don't + # bother killing a job multiple times + self.killed = False + # The job state as defined by PBS + self.state = None + + class CallHPCException(Exception): + """ + Exception class for providing extra context for HPC submission errors + """ + def __init__(self, run_hpc, description, command, result=None): + message = f'{description}' + if run_hpc.ssh: + message += f' on host "{run_hpc.ssh_host}"' + message += f'\nCommand: {command}' + if result: + message += f'\n\nResult:\n{result}' + super().__init__(message) + + def _connectSSH(self): + """ + Connects to the HPC SSH host. + + This is separate so that if the connection is dropped we can attempt + to connect to it again. 
+ """ + if not self.ssh_host: + raise Exception('SSH host not configured') + + # Try to find a key to use + key_filename = None + try: + ssh_config = os.path.expanduser('~/.ssh/config') + config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.ssh_host) + identityfile = config.get('identityfile') + if identityfile is not None and len(identityfile) > 0: + key_filename = identityfile[-1] + except: + pass + + self.ssh = paramiko.SSHClient() + self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.ssh.connect(self.ssh_host, key_filename=key_filename) + + def callHPC(self, command): + """Wrapper for calling a HPC command (qsub, qstat, etc) that supports + SSH-ing to another host as needed when calling from within apptainer""" + if not self.ssh: + raise Exception('HPC not currently supported outside of a container') + + with self.ssh_lock: + try: + # This inner try is for if the SSH connection has died. + try: + _, stdout, stderr = self.ssh.exec_command(command) + # Try to reconnect and run again + except paramiko.ssh_exception.SSHException: + self._connectSSH() + _, stdout, stderr = self.ssh.exec_command(command) + + exit_code = stdout.channel.recv_exit_status() + result = ''.join(stdout.readlines()) + if exit_code != 0: + result += ''.join(stderr.readlines()) + except Exception as e: + raise RunHPC.CallHPCException(self, 'Failed to execute remote command', command) from e + return exit_code, result.rstrip() + + def getJobSlots(self, job): + # Jobs only use one slot because they are ran externally + return 1 + + def availableSlots(self, params): + # Support managing 250 HPC jobs concurrently + return 250, False + + class JobData: + """ + Helper struct for storing the information to generate a job + """ + def __init__(self): + self.command = None + self.name = None + self.num_procs = None + self.num_threads = None + self.output_file = None + self.output_files = None + self.submission_file = None + self.walltime = None + + def submitJob(self, job): + 
""" + Method for submitting an HPC job for the given Job. + + Should be overridden. + """ + tester = job.getTester() + options = self.options + + job_data = self.JobData() + + # The submission script we're going to write to + job_data.submission_file = self.getHPCJobSubmissionPath(job) + # The combined stdout+stderr from the job + job_data.output_file = self.getHPCJobOutputPath(job) + # Clean these two files + for file in [job_data.submission_file, job_data.output_file]: + if os.path.exists(file): + os.remove(file) + + # Set up the command. We have special logic here for when we're using apptainer, + # where we need to put the MPI command outside of the apptainer call + full_command = '' + command = tester.getCommand(options) + mpi_command = self.parseMPICommand(command) + if mpi_command: + command = command.replace(mpi_command, '') + full_command += mpi_command + # Split out whitespace in the command and then use json dumps to + # escape quoted characters + command = json.dumps(command.replace('\n', ' ')) + + # Wrap the command with apptainer if we're in a container, and also bind + # in the root directory that the test is contained in + APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') + if APPTAINER_CONTAINER: + root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] + full_command += f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER} ' + full_command += command + + job_data.command = full_command + job_data.name = self.getHPCJobName(job) + job_data.num_procs = tester.getProcs(options) + job_data.num_threads = tester.getThreads(options) + job_data.walltime = str(datetime.timedelta(seconds=tester.getMaxTime())) + + # The output files that we're expected to generate so that the + # HPC job can add a terminator for them so that we can verify + # they are complete on the executing host + job_data.output_files = [] + for file in tester.getOutputFiles(options): + job_data.output_files.append(f'"{os.path.join(tester.getTestDir(), file)}"') 
+ job_data.output_files = ' '.join(job_data.output_files) + + # Let the derived class actually submit the job + job_id = self._submitJob(job, job_data) + + # Job has been submitted, so set it as queued + job.addCaveats(job_id) + self.setAndOutputJobStatus(job, job.queued) + + # Setup the job in the status map + with self.hpc_jobs_lock: + if job in self.hpc_jobs: + raise Exception('Job has already been submitted') + self.hpc_jobs[job] = self.HPCJob(job_id, job_data.command) + + def _submitJob(self, job, job_data): + """ + Submits a given job. + + Should be overridden. This is where the derived classes + will specialize how to submit the job. + """ + raise Exception('Unimplemented createJobScript()') + + def getHPCJob(self, job): + """ + Gets the HPCJob object given a Job + + This will periodically update statues given a timer. + """ + with self.hpc_jobs_lock: + # If this is the first time seeing this job, initialize it in the list + if job not in self.hpc_jobs: + raise Exception('Failed to get status for unsubmitted job') + + # Only update the statues periodically as this is called across threads + if self.hpc_jobs_status_timer is None or ((clock() - self.hpc_jobs_status_timer) > self.hpc_jobs_update_interval): + self.updateJobs() + self.hpc_jobs_status_timer = clock() + + return self.hpc_jobs.get(job) + + def updateJobs(self): + """ + Updates the underlying jobs. + + Should be overridden. + """ + raise Exception('Unimplemented updateJobs()') + + def buildRunner(self, job, options): + from TestHarness.runners.HPCRunner import HPCRunner + return HPCRunner(job, options, self) + + @staticmethod + def getHPCJobName(job) -> str: + """Gets the name of the HPC job given a tester + + PBS doesn't like ":" or "/", hence changing them to "." 
+ """ + return job.getTestName().replace(':', '.').replace('/', '.') + + @staticmethod + def getHPCJobOutputPathPrefix(job): + """Gets the absolute path prefix for a HPC job""" + return os.path.join(job.getTestDir(), "pbs_" + job.getTestNameShort().replace('/', '.')) + + @staticmethod + def getHPCJobOutputPath(job): + """Gets the absolute path for stdout/stderr for a HPC job""" + return RunHPC.getHPCJobOutputPathPrefix(job) + '.out' + + @staticmethod + def getHPCJobSubmissionPath(job): + """Gets the aboslute path for the qsub script for a HPC job""" + return RunHPC.getHPCJobOutputPathPrefix(job) + '.qsub' + + @staticmethod + def getOutputEndingComment() -> str: + """ + Gets the text we append to the stderr+stdout file to desginate + that it is complete + """ + return 'TESTHARNESS RUNHPC FILE TERMINATOR' + + @staticmethod + def parseMPICommand(command) -> str: + """ + Helper that splits out the mpi command from a given command, if any + """ + find_mpi = re.search('^(\s+)?(mpiexec|mpirun)(\s+-(n|np)\s+\d+)?(\s+)?', command) + if find_mpi is not None: + return find_mpi.group(0) + return None diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 0471c9c52693..696768cd909e 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -9,259 +9,55 @@ import os, sys, re, json, socket, datetime, threading from RunParallel import RunParallel -from TestHarness.runners.PBSRunner import PBSRunner +from RunHPC import RunHPC from timeit import default_timer as clock from PBScodes import * -import paramiko import jinja2 ## This Class is responsible for maintaining an interface to the PBS scheduling syntax -class RunPBS(RunParallel): +class RunPBS(RunHPC): + """ + Scheduler for HPC jobs that run with PBS. 
+ """ @staticmethod def validParams(): params = RunParallel.validParams() params.addParam('queue_template', os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template'), "Location of the PBS template") return params - class PBSJob: - """ - Structure that represents the cached information about a PBS job - """ - def __init__(self, id, command): - # The PBS job identifier - self.id = id - # Whether or not this job is done; here done doesn't mean if it - # was successful or not, just if it is not running/queued anymore - self.done = False - # The exit code of the command that was ran (if any) - self.exit_code = None - # The job state as defined by PBS - self.state = None - # The command that was ran within the qsub script - self.command = command - # Whether or not this job was killed; used so what we don't - # bother killing a job multiple times - self.killed = False - def __init__(self, harness, params): - RunParallel.__init__(self, harness, params) - self.params = params - self.options = harness.getOptions() - - # We don't want to report long running jobs here because we will - # manually set jobs as RUNNING as we notice their PBS status change - self.report_long_jobs = False - # We don't want to enforce the timeout here because we don't want to - # check it while the jobs are queued and PBS itself will handle the - # timeout because the job itself will be forcefully killed by PBS - self.enforce_timeout = False - - # Lock for accessing self.pbs_jobs - self.pbs_jobs_lock = threading.Lock() - # The last time statues were updated in getPBSJob() (if any) - self.pbs_jobs_status_timer = None - # How often to poll for status updates in getPBSJob() - self.pbs_jobs_update_interval = 10 - # Map of Job -> PBSJob - self.pbs_jobs = {} - - # The jump hostname for running PBS commands, if any - self.pbs_ssh_host = self.options.queue_host - # Setup the remote PBS host, if any (needed when submitted in a container) - self.pbs_ssh = None - # The lock for calling PBS 
commands via SSH, if any - self.pbs_ssh_lock = threading.Lock() - # Setup the jump host if provided - if self.pbs_ssh_host: - self._connectSSH() + super().__init__(harness, params) # Load the PBS template template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template') self.default_template = open(template_path, 'r').read() - if os.environ.get('APPTAINER_CONTAINER'): - if not self.pbs_ssh_host: - print('ERROR: --pbs-host must be set when using --pbs within apptainer') - sys.exit(1) - if not self.options.queue_source_command: - default_pre_source = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_source_apptainer') - self.options.queue_source_command = default_pre_source - print(f'INFO: Setting --pbs-pre-source={default_pre_source}') - if self.options.queue_source_command and not os.path.exists(self.options.queue_source_command): - print(f'ERROR: --pbs-pre-source path {self.options.queue_source_command} does not exist') - sys.exit(1) - - # Load the pre-source if it exists - self.source_contents = None - if self.options.queue_source_command: - self.source_contents = open(self.options.queue_source_command, 'r').read() - - def _connectSSH(self): - """ - Connects to the PBS SSH host. - - This is separate so that if the connection is dropped we can attempt - to connect to it again. 
- """ - if not self.pbs_ssh_host: - raise Exception('PBS SSH host not configured') - - # Try to find a key to use - key_filename = None - try: - ssh_config = os.path.expanduser('~/.ssh/config') - config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.pbs_ssh_host) - identityfile = config.get('identityfile') - if identityfile is not None and len(identityfile) > 0: - key_filename = identityfile[-1] - except: - pass - - self.pbs_ssh = paramiko.SSHClient() - self.pbs_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.pbs_ssh.connect(self.pbs_ssh_host, key_filename=key_filename) - - class CallPBSException(Exception): - """Exception class for providing extra context for PBS submission errors""" - def __init__(self, run_pbs, description, command, result=None): - message = f'{description}' - if run_pbs.pbs_ssh: - message += f' on host "{run_pbs.pbs_ssh_host}"' - message += f'\nCommand: {command}' - if result: - message += f'\n\nResult:\n{result}' - super().__init__(message) - - def callPBS(self, command): - """Wrapper for calling a PBS command (qsub, qstat, etc) that supports - SSH-ing to another host as needed when calling from within apptainer""" - if not self.pbs_ssh: - raise Exception('PBS not currently supported outside of a container') - - with self.pbs_ssh_lock: - try: - # This inner try is for if the SSH connection has died. 
- try: - _, stdout, stderr = self.pbs_ssh.exec_command(command) - # Try to reconnect and run again - except paramiko.ssh_exception.SSHException: - self._connectSSH() - _, stdout, stderr = self.pbs_ssh.exec_command(command) - - exit_code = stdout.channel.recv_exit_status() - result = ''.join(stdout.readlines()) - if exit_code != 0: - result += ''.join(stderr.readlines()) - except Exception as e: - raise RunPBS.CallPBSException(self, 'Failed to execute remote PBS command', command) from e - return exit_code, result.rstrip() - - def getJobSlots(self, job): - # Jobs only use one slot because they are ran externally - return 1 - - def availableSlots(self, params): - # Support managing 250 HPC jobs concurrently - return 250, False - - def getPBSJobName(self, job): - """Gets the name of the PBS job given a tester - - PBS doesn't like ":" or "/", hence changing them to "." - """ - return job.getTestName().replace(':', '.').replace('/', '.') - - def getPBSJobOutputPathPrefix(self, job): - """Gets the absolute path prefix for a PBS job""" - return os.path.join(job.getTestDir(), "pbs_" + job.getTestNameShort().replace('/', '.')) - - def getPBSJobOutputPath(self, job): - """Gets the absolute path for stdout/stderr for a PBS job""" - return self.getPBSJobOutputPathPrefix(job) + '.out' - - def getPBSJobSubmissionPath(self, job): - """Gets the aboslute path for the qsub script for a PBS job""" - return self.getPBSJobOutputPathPrefix(job) + '.qsub' - - @staticmethod - def parseMPICommand(command): - find_mpi = re.search('^(\s+)?(mpiexec|mpirun)(\s+-(n|np)\s+\d+)?(\s+)?', command) - if find_mpi is not None: - return find_mpi.group(0) - return None - - def submitJob(self, job): - """Submits a PBS job""" + def _submitJob(self, job, job_data): tester = job.getTester() options = self.options - # The qsub script we're going to write to - qsub_file = self.getPBSJobSubmissionPath(job) - # The combined stdout+stderr from the PBS job - output_file = self.getPBSJobOutputPath(job) - # Clean 
these two files - for file in [qsub_file, output_file]: - if os.path.exists(file): - os.remove(file) - - # Set up the command. We have special logic here for when we're using apptainer, - # where we need to put the MPI command outside of the apptainer call - full_command = '' - command = tester.getCommand(options) - mpi_command = self.parseMPICommand(command) - if mpi_command: - command = command.replace(mpi_command, '') - full_command += mpi_command - # Split out whitespace in the command and then use json dumps to - # escape quoted characters - command = json.dumps(command.replace('\n', ' ')) - - # Wrap the command with apptainer if we're in a container, and also bind - # in the root directory that the test is contained in - APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') - if APPTAINER_CONTAINER: - root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] - full_command += f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER} ' - - # Build the full command - full_command += command - # Build an escaped command so that we can print it out in the header - escaped_command = json.dumps(full_command) - - num_procs = tester.getProcs(options) - num_threads = tester.getThreads(options) - walltime = str(datetime.timedelta(seconds=tester.getMaxTime())) - # Add MOOSE's python path for python scripts moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) - # The output files that we're expected to generate so that the - # PBS job can add a terminator for them so that we can verify - # they are complete on the executing host - output_files = [] - for file in tester.getOutputFiles(options): - output_files.append(f'"{os.path.join(tester.getTestDir(), file)}"') - output_files = ' '.join(output_files) - # Set up the template - template_env = {'NAME': self.getPBSJobName(job), - 'SELECT': f'{num_procs}:mpiprocs=1:ncpus={num_threads}', - 'WALLTIME': walltime, + template_env = {'NAME': job_data.name, + 'SELECT': 
f'{job_data.num_procs}:mpiprocs=1:ncpus={job_data.num_threads}', + 'WALLTIME': job_data.walltime, 'PROJECT': self.options.queue_project, - 'OUTPUT': output_file, + 'OUTPUT': job_data.output_file, 'PLACE': 'scatter', 'TEST_NAME': tester.getTestName(), 'SUBMITTED_HOSTNAME': socket.gethostname(), 'CWD': tester.getTestDir(), - 'COMMAND': full_command, - 'ESCAPED_COMMAND': escaped_command, + 'COMMAND': job_data.command, + 'ESCAPED_COMMAND': json.dumps(job_data.command), 'ENDING_COMMENT': self.getOutputEndingComment(), 'MOOSE_PYTHONPATH': moose_python, - 'OUTPUT_FILES': output_files} + 'OUTPUT_FILES': job_data.output_files} if self.options.queue_queue: - template_env['QUEUE'] = self.options.queue_queue + template_env['QUEUE'] = options.queue_queue if self.options.queue_source_command: - template_env['SOURCE_FILE'] = self.options.queue_source_command + template_env['SOURCE_FILE'] = options.queue_source_command if self.source_contents: template_env['SOURCE_CONTENTS'] = self.source_contents @@ -273,145 +69,112 @@ def submitJob(self, job): script = definition_template.render(**template_env) # Write the script - open(qsub_file, 'w').write(script) + open(job_data.submission_file, 'w').write(script) # qsub submission command qsub_command = [f'cd {tester.getTestDir()}'] - qsub_command += [f'qsub {qsub_file}'] + qsub_command += [f'qsub {job_data.submission_file}'] qsub_command = '; '.join(qsub_command) # Set what we've ran for this job so that we can # potentially get the context in an error command_ran = qsub_command - if self.pbs_ssh: - command_ran = f"ssh {self.pbs_ssh_host} '{qsub_command}'" + if self.ssh: + command_ran = f"ssh {self.ssh_host} '{qsub_command}'" job.getTester().setCommandRan(command_ran) # Do the submission; this is thread safe # Eventually we might want to make this a pool so we can submit multiple # jobs at the same time - exit_code, result = self.callPBS(qsub_command) + exit_code, result = self.callHPC(qsub_command) # Nonzero return code if exit_code != 0: 
- raise self.CallPBSException(self, 'qsub failed', qsub_command, result) + raise self.CallHPCException(self, 'qsub failed', qsub_command, result) # Make sure the job ID is something we'd expect job_id = result search = re.search('^[0-9]+.[a-zA-Z0-9_-]+$', job_id) if not search: - raise self.CallPBSException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) + raise self.CallHPCException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) - # Job has been submitted, so set it as queued - job.addCaveats(job_id) - self.setAndOutputJobStatus(job, job.queued) + return job_id - # Setup the job in the status map - with self.pbs_jobs_lock: - if job in self.pbs_jobs: - raise Exception('Job has already been submitted') - self.pbs_jobs[job] = self.PBSJob(job_id, full_command) + def updateJobs(self): + # Obtain the IDs of jobs that are active that we need to poll for + active_job_ids = [] + for job, pbs_job in self.hpc_jobs.items(): + if not pbs_job.done: + active_job_ids.append(pbs_job.id) + + # Poll for all of the jobs within a single call + cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids + exit_code, result = self.callHPC(' '.join(cmd)) + if exit_code != 0: + raise self.CallHPCException(self, 'Failed to get job status', cmd, result) + + # Attempt to parse the status from the jobs + try: + json_result = json.loads(result) + job_results = json_result['Jobs'] + + for job, pbs_job in self.hpc_jobs.items(): + # We're only updating jobs that aren't done yet + if pbs_job.done: + continue + + # This job's result from the qstat command + job_result = job_results[pbs_job.id] + exit_code = job_result.get('Exit_status') + if exit_code is not None: + exit_code = int(exit_code) + state = job_result.get('job_state') + substate = job_result.get('substate') + terminated = int(substate) == 91 if substate else False + done = exit_code is not None or terminated + + # Get the job state, and report running if it switched to running + if state == 'R' and pbs_job.state != 'R': + 
self.setAndOutputJobStatus(job, job.running) + + # Update the PBSJob structure + pbs_job.done = done + pbs_job.exit_code = exit_code + pbs_job.state = state + + # Mark the job as terminated (past walltime, over resources, killed) + if terminated: + job.setStatus(job.error, 'PBS JOB TERMINATED') + except Exception as e: + raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e def killJob(self, job): """Kills a PBS job""" - with self.pbs_jobs_lock: - if job not in self.pbs_jobs: + with self.hpc_jobs_lock: + if job not in self.hpc_jobs: return - pbs_job = self.pbs_jobs[job] - if pbs_job.done or pbs_job.killed: + hpc_job = self.hpc_jobs[job] + if hpc_job.done or hpc_job.killed: return - job_id = self.pbs_jobs[job].id + job_id = self.hpc_jobs[job].id # Don't care about whether or not this failed - self.callPBS(f'qdel {job_id}') + self.callHPC(f'qdel {job_id}') def killRemaining(self, keyboard=False): """Kills all currently running PBS jobs""" job_ids = [] - with self.pbs_jobs_lock: - for pbs_job in self.pbs_jobs.values(): - if not pbs_job.done: - job_ids.append(pbs_job.id) + with self.hpc_jobs_lock: + for hpc_job in self.hpc_jobs.values(): + if not hpc_job.done: + job_ids.append(hpc_job.id) # Don't care about whether or not this failed - self.callPBS(f'qdel {" ".join(job_ids)}') + self.callHPC(f'qdel {" ".join(job_ids)}') - with self.pbs_jobs_lock: - for pbs_job in self.pbs_jobs.values(): - if not pbs_job.done: - pbs_job.killed = True + with self.hpc_jobs_lock: + for hpc_job in self.hpc_jobs.values(): + if not hpc_job.done: + hpc_job.killed = True RunParallel.killRemaining(self, keyboard) - - def buildRunner(self, job, options): - return PBSRunner(job, options, self) - - def getOutputEndingComment(self): - """Gets the text we append to the PBS stderr+stdout file to desginate - that it is complete""" - return 'TESTHARNESS RUNPBS FILE TERMINATOR' - - def getPBSJob(self, job): - """Gets the PBSJob object for a given Job - - This 
will periodically update the PBSJob in a thread safe manner so - that we are not constantly calling qstat for every call.""" - - with self.pbs_jobs_lock: - # If this is the first time seeing this job, initialize it in the list - if job not in self.pbs_jobs: - raise Exception('Failed to get status for unsubmitted job') - - # Only update the statues periodically as this is called across threads - if self.pbs_jobs_status_timer is None or ((clock() - self.pbs_jobs_status_timer) > self.pbs_jobs_update_interval): - # Obtain the IDs of jobs that are active that we need to poll for - active_job_ids = [] - for job, pbs_job in self.pbs_jobs.items(): - if not pbs_job.done: - active_job_ids.append(pbs_job.id) - - # Poll for all of the jobs within a single call - cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids - exit_code, result = self.callPBS(' '.join(cmd)) - if exit_code != 0: - raise self.CallPBSException(self, 'Failed to get job status', cmd, result) - - # Register that we've updated the status - self.pbs_jobs_status_timer = clock() - - # Attempt to parse the status from the jobs - try: - json_result = json.loads(result) - job_results = json_result['Jobs'] - - for job, pbs_job in self.pbs_jobs.items(): - # We're only updating jobs that aren't done yet - if pbs_job.done: - continue - - # This job's result from the qstat command - job_result = job_results[pbs_job.id] - exit_code = job_result.get('Exit_status') - if exit_code is not None: - exit_code = int(exit_code) - state = job_result.get('job_state') - substate = job_result.get('substate') - terminated = int(substate) == 91 if substate else False - done = exit_code is not None or terminated - - # Get the job state, and report running if it switched to running - if state == 'R' and pbs_job.state != 'R': - self.setAndOutputJobStatus(job, job.running) - - # Update the PBSJob structure - pbs_job.done = done - pbs_job.exit_code = exit_code - pbs_job.state = state - - # Mark the job as terminated (past walltime, over 
resources, killed) - if terminated: - job.setStatus(job.error, 'PBS JOB TERMINATED') - except Exception as e: - raise self.CallPBSException(self, f'Failed to parse collective job status', cmd, result) from e - - return self.pbs_jobs[job] diff --git a/python/TestHarness/schedulers/pbs_source_apptainer b/python/TestHarness/schedulers/hpc_source similarity index 100% rename from python/TestHarness/schedulers/pbs_source_apptainer rename to python/TestHarness/schedulers/hpc_source diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 890d397c5010..624e12b81f44 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -31,7 +31,7 @@ echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" echo "Output: {{ OUTPUT }}" module list echo "################################################################################" -echo "Beginning TestHarness RunPBS test execution" +echo "Beginning TestHarness RunHPC test execution" echo "################################################################################" # Move into the test directory @@ -51,7 +51,7 @@ set -e # We will read this output later on to try to capture the return code # in the event that PBS doesn't get it to us correctly echo "################################################################################" -echo "Completed TestHarness RunPBS test execution; exit code = $return_code" +echo "Completed TestHarness RunHPC test execution; exit code = $return_code" echo "################################################################################" # Append a terminator to all of the output files for file syncing across NFS diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 7205fb67ba5f..fe2671aedfb0 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -358,3 +358,11 @@ def mustOutputExist(self): if self.specs['should_crash']: 
return self.getExitCode() != 0 return self.getExitCode() == 0 + + def needFullOutput(self, options): + # We need the full output when we're trying to read from said output + params = ['expect_err', 'expect_assert', 'expect_out', 'absent_out'] + for param in params: + if self.specs.isValid(param): + return True + return super().needFullOutput(options) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 39394c7390f0..6f492858b437 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -803,3 +803,13 @@ def checkRunnableBase(self, options): # Check the return values of the derived classes self._runnable = self.checkRunnable(options) return self._runnable + + def needFullOutput(self, options): + """ + Whether or not the full output is needed. + + If this is True, it means that we cannot truncate + the stderr/stdout output. This is often needed + when we're trying to read something from the output. + """ + return False From acadabac5ca04b2b4e020444030bd86003d245b3 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 16:25:25 -0600 Subject: [PATCH 059/243] Allow some time for the submission script to appear --- python/TestHarness/schedulers/RunPBS.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 696768cd909e..548cd0ce93fc 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -7,10 +7,9 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, re, json, socket, datetime, threading +import os, re, json, socket, time from RunParallel import RunParallel from RunHPC import RunHPC -from timeit import default_timer as clock from PBScodes import * import jinja2 @@ -71,9 +70,18 @@ def _submitJob(self, job, job_data): # Write the script 
open(job_data.submission_file, 'w').write(script) - # qsub submission command - qsub_command = [f'cd {tester.getTestDir()}'] - qsub_command += [f'qsub {job_data.submission_file}'] + # Submission command. Here we have a simple bash loop + # that will try to wait for the file if it doesn't exist yet + qsub_command = [f'cd {tester.getTestDir()}', + f'FILE="{job_data.submission_file}"', + 'for i in {1..40}', + 'do if [ -e "$FILE" ]', + 'then qsub $FILE', + 'exit $?', + 'else sleep 0.25', + 'fi', + 'done', + 'exit 1'] qsub_command = '; '.join(qsub_command) # Set what we've ran for this job so that we can From 4d93f5d991a0c5bab48ed325fe689a16c77e7a49 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 14 May 2024 17:22:37 -0600 Subject: [PATCH 060/243] Setup threaded SSH --- python/TestHarness/schedulers/RunHPC.py | 115 ++++++++++++++---------- python/TestHarness/schedulers/RunPBS.py | 2 +- 2 files changed, 71 insertions(+), 46 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 8adb04880450..4e000533cf3a 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -10,6 +10,7 @@ from RunParallel import RunParallel import threading, os, re, sys, datetime, json import paramiko +from multiprocessing.pool import ThreadPool from timeit import default_timer as clock class RunHPC(RunParallel): @@ -41,13 +42,36 @@ def __init__(self, harness, params): # The jump hostname for running commands, if any self.ssh_host = self.options.queue_host - # Setup the remote HPC host, if any (needed when submitted in a container) - self.ssh = None - # The lock for calling commands via SSH, if any - self.ssh_lock = threading.Lock() + # The SSH key to use for connections + self.ssh_key_filename = None + # The pool of processes for running threaded SSH comments + self.ssh_pool = None + # The threaded SSHClient objects, mapped by thread identifier + self.ssh_clients = None + # The lock for 
calling commands via SSH, + self.ssh_clients_lock = None # Setup the jump host if provided if self.ssh_host: - self._connectSSH() + self.ssh_pool = ThreadPool(processes=5) + self.ssh_clients = {} + self.ssh_clients_lock = threading.Lock() + + # Try to find a key to use + try: + ssh_config = os.path.expanduser('~/.ssh/config') + config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.ssh_host) + identityfile = config.get('identityfile') + if identityfile is not None and len(identityfile) > 0: + self.ssh_key_filename = identityfile[-1] + except: + pass + + # Make sure that we can connect up front + try: + self.callHPC('hostname') + except: + print(f'Failed to connect to HPC host {self.ssh_host}') + sys.exit(1) if os.environ.get('APPTAINER_CONTAINER'): if not self.ssh_host: @@ -93,61 +117,62 @@ class CallHPCException(Exception): """ def __init__(self, run_hpc, description, command, result=None): message = f'{description}' - if run_hpc.ssh: + if run_hpc.ssh_host: message += f' on host "{run_hpc.ssh_host}"' message += f'\nCommand: {command}' if result: message += f'\n\nResult:\n{result}' super().__init__(message) - def _connectSSH(self): + def _getSSHClient(self, reconnect=False): """ - Connects to the HPC SSH host. + Gets a SSH client owned by a thread. - This is separate so that if the connection is dropped we can attempt - to connect to it again. + This is threaded so that we can operate a few connections at once. """ - if not self.ssh_host: - raise Exception('SSH host not configured') + process = threading.get_ident() + with self.ssh_clients_lock: + if process not in self.ssh_clients or reconnect: + self.ssh_clients[process] = paramiko.SSHClient() + self.ssh_clients[process].set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.ssh_clients[process].connect(self.ssh_host, key_filename=self.ssh_key_filename) + return self.ssh_clients.get(process) + + def _callSSH(self, command): + """ + Calls a SSH command. 
- # Try to find a key to use - key_filename = None + Should only be used via apply with the self.ssh_pool. + """ + client = self._getSSHClient() try: - ssh_config = os.path.expanduser('~/.ssh/config') - config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.ssh_host) - identityfile = config.get('identityfile') - if identityfile is not None and len(identityfile) > 0: - key_filename = identityfile[-1] - except: - pass - - self.ssh = paramiko.SSHClient() - self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.ssh.connect(self.ssh_host, key_filename=key_filename) + _, stdout, stderr = client.exec_command(command) + # SSH connection might have died, so try to create a new one + except paramiko.ssh_exception.SSHException: + try: + client = self._getSSHClient(reconnect=True) + _, stdout, stderr = client.exec_command(command) + except Exception as e: + raise RunHPC.CallHPCException(self, 'Failed to execute remote command', command) from e + # An even worse failure happened here + except Exception as e: + raise RunHPC.CallHPCException(self, 'Failed to execute remote command', command) from e + + exit_code = stdout.channel.recv_exit_status() + result = ''.join(stdout.readlines()) + if exit_code != 0: + result += ''.join(stderr.readlines()) + return exit_code, result.rstrip() def callHPC(self, command): - """Wrapper for calling a HPC command (qsub, qstat, etc) that supports - SSH-ing to another host as needed when calling from within apptainer""" - if not self.ssh: + """ + Wrapper for calling a HPC command (qsub, qstat, etc) that supports + SSH-ing to another host as needed when calling from within apptainer + """ + if not self.ssh_host: raise Exception('HPC not currently supported outside of a container') - with self.ssh_lock: - try: - # This inner try is for if the SSH connection has died. 
- try: - _, stdout, stderr = self.ssh.exec_command(command) - # Try to reconnect and run again - except paramiko.ssh_exception.SSHException: - self._connectSSH() - _, stdout, stderr = self.ssh.exec_command(command) - - exit_code = stdout.channel.recv_exit_status() - result = ''.join(stdout.readlines()) - if exit_code != 0: - result += ''.join(stderr.readlines()) - except Exception as e: - raise RunHPC.CallHPCException(self, 'Failed to execute remote command', command) from e - return exit_code, result.rstrip() + return self.ssh_pool.apply(self._callSSH, (command,)) def getJobSlots(self, job): # Jobs only use one slot because they are ran externally diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 548cd0ce93fc..252d95342827 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -87,7 +87,7 @@ def _submitJob(self, job, job_data): # Set what we've ran for this job so that we can # potentially get the context in an error command_ran = qsub_command - if self.ssh: + if self.ssh_host: command_ran = f"ssh {self.ssh_host} '{qsub_command}'" job.getTester().setCommandRan(command_ran) From fda9d07814fa8bba43a801ab5907aae24ffb638f Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 09:08:06 -0600 Subject: [PATCH 061/243] Get command escaping working, add more context to output --- python/TestHarness/schedulers/RunHPC.py | 70 ++++++++++++++++------ python/TestHarness/schedulers/RunPBS.py | 7 ++- python/TestHarness/schedulers/pbs_template | 6 +- 3 files changed, 59 insertions(+), 24 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 4e000533cf3a..aedae89fa356 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -187,15 +187,34 @@ class JobData: Helper struct for storing the information to generate a job """ def __init__(self): + # The command to be ran in 
the job self.command = None + # self.command but escaped so that it can be printed + self.command_printable = None + # The name of the job self.name = None + # The number of procs to run the job with self.num_procs = None + # The number of threads to run the job with self.num_threads = None + # The combined stdout+stderr output file self.output_file = None + # The additonal output files to be read (csv, exodus, etc) self.output_files = None - self.submission_file = None + # The path to the submission script + self.submission_script = None + # The walltime to run the job with self.walltime = None + @staticmethod + def escapeCommand(command: str) -> str: + """ + Escapes quotes and newlines in a command. + """ + if command: + return json.dumps(command.replace('\n', ' '))[1:-1] + return '' + def submitJob(self, job): """ Method for submitting an HPC job for the given Job. @@ -208,35 +227,48 @@ def submitJob(self, job): job_data = self.JobData() # The submission script we're going to write to - job_data.submission_file = self.getHPCJobSubmissionPath(job) + job_data.submission_script = self.getHPCJobSubmissionPath(job) # The combined stdout+stderr from the job job_data.output_file = self.getHPCJobOutputPath(job) # Clean these two files - for file in [job_data.submission_file, job_data.output_file]: + for file in [job_data.submission_script, job_data.output_file]: if os.path.exists(file): os.remove(file) - # Set up the command. 
We have special logic here for when we're using apptainer, - # where we need to put the MPI command outside of the apptainer call - full_command = '' - command = tester.getCommand(options) - mpi_command = self.parseMPICommand(command) - if mpi_command: - command = command.replace(mpi_command, '') - full_command += mpi_command - # Split out whitespace in the command and then use json dumps to - # escape quoted characters - command = json.dumps(command.replace('\n', ' ')) + # The escaped command to be ran + command = self.escapeCommand(tester.getCommand(options)) - # Wrap the command with apptainer if we're in a container, and also bind - # in the root directory that the test is contained in + # Special logic for when we're running with apptainer, in which case + # we need to manipulate the command like such + # Original command: + # New command: apptainer exec /path/to/image "" + # This is also the reason why we have to form job_data.command_printable; + # the extra quotes around need to be escaped. 
APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: + # Separate out the MPI command + mpi_command = self.escapeCommand(self.parseMPICommand(command)) + # Remove the MPI command from the run command + command = command.replace(mpi_command, '') + + # Start with the mpiexec call + job_data.command = mpi_command + job_data.command_printable = mpi_command + + # The root filesystem path that we're in so that we can be sure to bind + # it into the container root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] - full_command += f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER} ' - full_command += command + # The apptainer command that will get sandwiched in the middle + apptainer_command = f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER}' + apptainer_command = self.escapeCommand(apptainer_command) + # Append the apptainer command along with the command to be ran + job_data.command += f'{apptainer_command} "{command}"' + job_data.command_printable += f'{apptainer_command} \\"{command}\\"' + # Not in apptainer, so we can just use the escaped command as is + else: + job_data.command = command + job_data.command_printable = command - job_data.command = full_command job_data.name = self.getHPCJobName(job) job_data.num_procs = tester.getProcs(options) job_data.num_threads = tester.getThreads(options) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 252d95342827..1e2204014922 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -44,12 +44,13 @@ def _submitJob(self, job, job_data): 'WALLTIME': job_data.walltime, 'PROJECT': self.options.queue_project, 'OUTPUT': job_data.output_file, + 'SUBMISSION_SCRIPT': job_data.submission_script, 'PLACE': 'scatter', 'TEST_NAME': tester.getTestName(), 'SUBMITTED_HOSTNAME': socket.gethostname(), 'CWD': tester.getTestDir(), 'COMMAND': job_data.command, - 'ESCAPED_COMMAND': 
json.dumps(job_data.command), + 'COMMAND_PRINTABLE': job_data.command_printable, 'ENDING_COMMENT': self.getOutputEndingComment(), 'MOOSE_PYTHONPATH': moose_python, 'OUTPUT_FILES': job_data.output_files} @@ -68,12 +69,12 @@ def _submitJob(self, job, job_data): script = definition_template.render(**template_env) # Write the script - open(job_data.submission_file, 'w').write(script) + open(job_data.submission_script, 'w').write(script) # Submission command. Here we have a simple bash loop # that will try to wait for the file if it doesn't exist yet qsub_command = [f'cd {tester.getTestDir()}', - f'FILE="{job_data.submission_file}"', + f'FILE="{job_data.submission_script}"', 'for i in {1..40}', 'do if [ -e "$FILE" ]', 'then qsub $FILE', diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 624e12b81f44..487bd9924fe1 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -22,12 +22,14 @@ set -e export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} # Print a useful header +echo "################################################################################" +echo "TestHarness RunPBS job on $(hostname) in job ${PBS_JOBID}" echo "Time: $(date)" echo "Test: {{ TEST_NAME }}" echo "Directory: {{ CWD }}" -echo "Command: {{ ESCAPED_COMMAND }}" -echo "Hostname: $(hostname)" +echo "Command: {{ COMMAND_PRINTABLE }}" echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" +echo "Submission script: {{ SUBMISSION_SCRIPT }}" echo "Output: {{ OUTPUT }}" module list echo "################################################################################" From 6344c89e2bfd60974107aab26576255aa59c8781 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 09:11:36 -0600 Subject: [PATCH 062/243] Simplify the MPI command logic --- python/TestHarness/schedulers/RunHPC.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git 
a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index aedae89fa356..c7a65b24ad09 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -247,13 +247,17 @@ def submitJob(self, job): APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: # Separate out the MPI command - mpi_command = self.escapeCommand(self.parseMPICommand(command)) - # Remove the MPI command from the run command - command = command.replace(mpi_command, '') - - # Start with the mpiexec call - job_data.command = mpi_command - job_data.command_printable = mpi_command + mpi_command = self.escapeCommand(self.parseMPICommand()) + # Add MPI command as the prefix and remove it from the base command + if mpi_command: + command_prefix = mpi_command + command = command.replace(mpi_command, '') + # No MPI command; nothing to do + else: + command_prefix = '' + + job_data.command = command_prefix + job_data.command_printable = command_prefix # The root filesystem path that we're in so that we can be sure to bind # it into the container From 2dccc4ca45c406832dc114833743c2c27329d40b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 09:22:11 -0600 Subject: [PATCH 063/243] Rename variable, fix missing callback --- python/TestHarness/schedulers/RunHPC.py | 10 +++++----- python/TestHarness/schedulers/RunPBS.py | 2 +- python/TestHarness/schedulers/pbs_template | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index c7a65b24ad09..73d5e6ffca6b 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -200,7 +200,7 @@ def __init__(self): # The combined stdout+stderr output file self.output_file = None # The additonal output files to be read (csv, exodus, etc) - self.output_files = None + self.additional_output_files = None # The path to the submission 
script self.submission_script = None # The walltime to run the job with @@ -247,7 +247,7 @@ def submitJob(self, job): APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: # Separate out the MPI command - mpi_command = self.escapeCommand(self.parseMPICommand()) + mpi_command = self.escapeCommand(self.parseMPICommand(command)) # Add MPI command as the prefix and remove it from the base command if mpi_command: command_prefix = mpi_command @@ -281,10 +281,10 @@ def submitJob(self, job): # The output files that we're expected to generate so that the # HPC job can add a terminator for them so that we can verify # they are complete on the executing host - job_data.output_files = [] + additional_output = [] for file in tester.getOutputFiles(options): - job_data.output_files.append(f'"{os.path.join(tester.getTestDir(), file)}"') - job_data.output_files = ' '.join(job_data.output_files) + additional_output.append(f'"{os.path.join(tester.getTestDir(), file)}"') + job_data.additional_output_files = ' '.join(additional_output) # Let the derived class actually submit the job job_id = self._submitJob(job, job_data) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 1e2204014922..0c0166208f90 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -53,7 +53,7 @@ def _submitJob(self, job, job_data): 'COMMAND_PRINTABLE': job_data.command_printable, 'ENDING_COMMENT': self.getOutputEndingComment(), 'MOOSE_PYTHONPATH': moose_python, - 'OUTPUT_FILES': job_data.output_files} + 'ADDITIONAL_OUTPUT_FILES': job_data.additional_output_files} if self.options.queue_queue: template_env['QUEUE'] = options.queue_queue if self.options.queue_source_command: diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 487bd9924fe1..b0fbea4cc245 100644 --- a/python/TestHarness/schedulers/pbs_template +++ 
b/python/TestHarness/schedulers/pbs_template @@ -57,8 +57,8 @@ echo "Completed TestHarness RunHPC test execution; exit code = $return_code" echo "################################################################################" # Append a terminator to all of the output files for file syncing across NFS -OUTPUT_FILES=({{ OUTPUT_FILES }}) -for file in ${OUTPUT_FILES[@]}; do +ADDITIONAL_OUTPUT_FILES=({{ ADDITIONAL_OUTPUT_FILES }}) +for file in ${ADDITIONAL_OUTPUT_FILES[@]}; do if [ ! -e "$file" ]; then continue fi From 48be36302074d6f45e3602592ce03be982d4b297 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 10:20:23 -0600 Subject: [PATCH 064/243] Fix escape once and for all, add detailed test spec+name to template --- python/TestHarness/schedulers/RunHPC.py | 10 +++++++--- python/TestHarness/schedulers/RunPBS.py | 3 ++- python/TestHarness/schedulers/pbs_template | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 73d5e6ffca6b..a96caad88327 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from RunParallel import RunParallel -import threading, os, re, sys, datetime, json +import threading, os, re, sys, datetime, shlex import paramiko from multiprocessing.pool import ThreadPool from timeit import default_timer as clock @@ -209,10 +209,14 @@ def __init__(self): @staticmethod def escapeCommand(command: str) -> str: """ - Escapes quotes and newlines in a command. 
+ Escapes a command for use as a bash command """ if command: - return json.dumps(command.replace('\n', ' '))[1:-1] + # escape for bash; [1:-1] removes start+end quotes + command = shlex.quote(command)[1:-1] + # remove newlines + command = command.replace('\n', ' ') + return command return '' def submitJob(self, job): diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 0c0166208f90..df5bc165c2bf 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -46,7 +46,8 @@ def _submitJob(self, job, job_data): 'OUTPUT': job_data.output_file, 'SUBMISSION_SCRIPT': job_data.submission_script, 'PLACE': 'scatter', - 'TEST_NAME': tester.getTestName(), + 'TEST_SPEC': tester.getSpecFile(), + 'TEST_NAME': tester.getTestNameShort(), 'SUBMITTED_HOSTNAME': socket.gethostname(), 'CWD': tester.getTestDir(), 'COMMAND': job_data.command, diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index b0fbea4cc245..7f0142fa0731 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -25,7 +25,7 @@ export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} echo "################################################################################" echo "TestHarness RunPBS job on $(hostname) in job ${PBS_JOBID}" echo "Time: $(date)" -echo "Test: {{ TEST_NAME }}" +echo "Test: {{ TEST_SPEC }}:{{ TEST_NAME }}" echo "Directory: {{ CWD }}" echo "Command: {{ COMMAND_PRINTABLE }}" echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" From 81db8089e6d2f9decda6324f7282994bc21a8f61 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 11:20:35 -0600 Subject: [PATCH 065/243] More escape fixing... 
--- python/TestHarness/schedulers/RunHPC.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index a96caad88327..a6d562e77553 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from RunParallel import RunParallel -import threading, os, re, sys, datetime, shlex +import threading, os, re, sys, datetime, shlex, json import paramiko from multiprocessing.pool import ThreadPool from timeit import default_timer as clock @@ -212,9 +212,13 @@ def escapeCommand(command: str) -> str: Escapes a command for use as a bash command """ if command: - # escape for bash; [1:-1] removes start+end quotes + # For the following, [1:-1] removes the additional + # quotes that were added to wrap the command + # General escape for bash command = shlex.quote(command)[1:-1] - # remove newlines + # Escape ' and "" + command = json.dumps(command)[1:-1] + # Remove newlines command = command.replace('\n', ' ') return command return '' From 010a55a31b7631d64c6acd6026b15dc6272457e0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 11:21:03 -0600 Subject: [PATCH 066/243] Add printout about output finalization --- python/TestHarness/schedulers/pbs_template | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 7f0142fa0731..f4c7635e726b 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -54,12 +54,12 @@ set -e # in the event that PBS doesn't get it to us correctly echo "################################################################################" echo "Completed TestHarness RunHPC test execution; exit code = $return_code" -echo 
"################################################################################" # Append a terminator to all of the output files for file syncing across NFS ADDITIONAL_OUTPUT_FILES=({{ ADDITIONAL_OUTPUT_FILES }}) for file in ${ADDITIONAL_OUTPUT_FILES[@]}; do if [ ! -e "$file" ]; then + echo "Failed to finalize output $file" continue fi @@ -70,8 +70,11 @@ for file in ${ADDITIONAL_OUTPUT_FILES[@]}; do else printf "\n{{ ENDING_COMMENT }}" >> $file; fi + echo "Finalized output $file" done +echo "################################################################################" + # Append a recognizable string at the end of the output. We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output From 951cf1d49076787d104252d8844611f26853355a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 11:44:57 -0600 Subject: [PATCH 067/243] Properly specify output files for JSONDiff and SchemaDiff --- python/TestHarness/testers/JSONDiff.py | 7 +------ python/TestHarness/testers/SchemaDiff.py | 5 ++++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/TestHarness/testers/JSONDiff.py b/python/TestHarness/testers/JSONDiff.py index 68d5a72ecb68..810bdfacd547 100644 --- a/python/TestHarness/testers/JSONDiff.py +++ b/python/TestHarness/testers/JSONDiff.py @@ -8,9 +8,8 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from SchemaDiff import SchemaDiff -from TestHarness import util -class JSONDiff(SchemaDiff): +class JSONDiff(SchemaDiff): @staticmethod def validParams(): params = SchemaDiff.validParams() @@ -45,10 +44,6 @@ def __init__(self, name, params): re_entry += f"\['{key}'\]" self.exclude_regex_paths.append(re_entry) - def prepare(self, options): - if self.specs['delete_output_before_running'] == True: - util.deleteFilesAndFolders(self.getTestDir(), self.specs['jsondiff']) - def load_file(self, path1): import json with open(path1,"r") as f: diff --git 
a/python/TestHarness/testers/SchemaDiff.py b/python/TestHarness/testers/SchemaDiff.py index 6e7ba4eef30a..8bd5524b2e8b 100644 --- a/python/TestHarness/testers/SchemaDiff.py +++ b/python/TestHarness/testers/SchemaDiff.py @@ -36,9 +36,12 @@ def __init__(self, name, params): # So that derived classes can internally pass skip regex paths self.exclude_regex_paths = [] + def getOutputFiles(self, options): + return self.specs['schemadiff'] + def prepare(self, options): if self.specs['delete_output_before_running'] == True: - util.deleteFilesAndFolders(self.getTestDir(), self.specs['schemadiff']) + util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles(options)) def processResults(self, moose_dir, options, output): output += self.testFileOutput(moose_dir, options, output) From 9a7e1d91f00a364809f2b248d3885704c9e43c44 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 11:46:43 -0600 Subject: [PATCH 068/243] Properly skip tests for HPC --- test/tests/restart/advanced_stateful_material/tests | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/tests/restart/advanced_stateful_material/tests b/test/tests/restart/advanced_stateful_material/tests index 4722f382eab4..ca4a8433d6ca 100644 --- a/test/tests/restart/advanced_stateful_material/tests +++ b/test/tests/restart/advanced_stateful_material/tests @@ -6,6 +6,10 @@ type = RunApp input = advanced_stateful_material.i requirement = 'The system shall be able to generate a checkpoint of stateful material properties that supports the advanced restart of the properties' + # RunException with some cases will only run in serial, which means if + # "checkpoint" runs in parallel we'll actually get an error with a + # processor mismatch instead of the one that we want + hpc = false [] [errors] @@ -19,10 +23,6 @@ expect_err = 'The stateful material properties in RestartStatefulMaterial "test" that are being restarted do not match the stored properties in the same material object from the 
checkpoint.' prereq = checkpoint detail = 'a stateful property for a single material object is removed' - # RunException with some cases will only run in serial, which means if - # "checkpoint" runs in parallel we'll actually get an error with a - # processor mismatch instead of the one that we want - hpc = false [] [add_prop] type = RunException From 1db09ad00e58eb3e8137f1762563e658aeccb067 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 13:40:38 -0600 Subject: [PATCH 069/243] Yet another attempt at fixing escapes --- python/TestHarness/schedulers/RunHPC.py | 40 ++++++++-------------- python/TestHarness/schedulers/pbs_template | 12 +++---- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index a6d562e77553..26c965c9f209 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -206,23 +206,6 @@ def __init__(self): # The walltime to run the job with self.walltime = None - @staticmethod - def escapeCommand(command: str) -> str: - """ - Escapes a command for use as a bash command - """ - if command: - # For the following, [1:-1] removes the additional - # quotes that were added to wrap the command - # General escape for bash - command = shlex.quote(command)[1:-1] - # Escape ' and "" - command = json.dumps(command)[1:-1] - # Remove newlines - command = command.replace('\n', ' ') - return command - return '' - def submitJob(self, job): """ Method for submitting an HPC job for the given Job. @@ -243,19 +226,25 @@ def submitJob(self, job): if os.path.exists(file): os.remove(file) - # The escaped command to be ran - command = self.escapeCommand(tester.getCommand(options)) + # The command to be ran. We're going to wrap this command in single quotes + # so that we don't bash evaluate anything, hence the replacement of a + # single quote. Yes, this truly is madness. But it looks like it works. 
+ # Pro tip: don't ever have to run things in bash with complex syntax + # that is quite bash like. + command = tester.getCommand(options) + command = command.replace('\n', ' ') + command = command.replace("'", "\'\\'\'") # Special logic for when we're running with apptainer, in which case # we need to manipulate the command like such # Original command: - # New command: apptainer exec /path/to/image "" + # New command: apptainer exec /path/to/image '' # This is also the reason why we have to form job_data.command_printable; # the extra quotes around need to be escaped. APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: # Separate out the MPI command - mpi_command = self.escapeCommand(self.parseMPICommand(command)) + mpi_command = self.parseMPICommand(command) # Add MPI command as the prefix and remove it from the base command if mpi_command: command_prefix = mpi_command @@ -272,14 +261,13 @@ def submitJob(self, job): root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] # The apptainer command that will get sandwiched in the middle apptainer_command = f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER}' - apptainer_command = self.escapeCommand(apptainer_command) # Append the apptainer command along with the command to be ran - job_data.command += f'{apptainer_command} "{command}"' - job_data.command_printable += f'{apptainer_command} \\"{command}\\"' + job_data.command += f"{apptainer_command} '{command}'" + job_data.command_printable += f"{apptainer_command} \'\\'\'{command}\'\\'\'" # Not in apptainer, so we can just use the escaped command as is else: - job_data.command = command - job_data.command_printable = command + job_data.command = f"'{command}'" + job_data.command_printable += f"\'\\'\'{command}\'\\'\'" job_data.name = self.getHPCJobName(job) job_data.num_procs = tester.getProcs(options) diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template index 
f4c7635e726b..382726cbc4d2 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/pbs_template @@ -25,12 +25,12 @@ export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} echo "################################################################################" echo "TestHarness RunPBS job on $(hostname) in job ${PBS_JOBID}" echo "Time: $(date)" -echo "Test: {{ TEST_SPEC }}:{{ TEST_NAME }}" -echo "Directory: {{ CWD }}" -echo "Command: {{ COMMAND_PRINTABLE }}" -echo "Submitted hostname: {{ SUBMITTED_HOSTNAME }}" -echo "Submission script: {{ SUBMISSION_SCRIPT }}" -echo "Output: {{ OUTPUT }}" +echo 'Test: {{ TEST_SPEC }}:{{ TEST_NAME }}' +echo 'Directory: {{ CWD }}' +echo 'Command: {{ COMMAND_PRINTABLE }}' +echo 'Submitted hostname: {{ SUBMITTED_HOSTNAME }}' +echo 'Submission script: {{ SUBMISSION_SCRIPT }}' +echo 'Output: {{ OUTPUT }}' module list echo "################################################################################" echo "Beginning TestHarness RunHPC test execution" From edaed58be3e81f708cebd02a6e16001374f5076c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 13:40:53 -0600 Subject: [PATCH 070/243] Remove unused includes --- python/TestHarness/schedulers/RunHPC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 26c965c9f209..069f3cbd82a3 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from RunParallel import RunParallel -import threading, os, re, sys, datetime, shlex, json +import threading, os, re, sys, datetime import paramiko from multiprocessing.pool import ThreadPool from timeit import default_timer as clock From 1538189ef46d98571e71ade346ecfbdc7068e045 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 14:26:47 -0600 Subject: [PATCH 071/243] Append to all names 
for the object --- python/TestHarness/TestHarness.py | 2 +- python/TestHarness/testers/Tester.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 3eca6d717041..05bdaad6c8e0 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -574,8 +574,8 @@ def appendRecoverableTests(self, testers): part2 = copy.deepcopy(part1) # Part 1: + part1.appendTestName('_part1') part1_params = part1.parameters() - part1_params['test_name'] += '_part1' part1_params['cli_args'].append('--test-checkpoint-half-transient') if self.options.recoversuffix == 'cpa': part1_params['cli_args'].append('Outputs/out/type=Checkpoint') diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 6f492858b437..1f8e6b8e1691 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -263,6 +263,15 @@ def getTestNameShort(self): """ return test short name (not including the path) """ return self.specs['test_name_short'] + def appendTestName(self, value): + """ + Appends a value to the test name. + + Used when creating duplicate Testers for recover tests. 
+ """ + self.specs['test_name'] += value + self.specs['test_name_short'] += value + def getPrereqs(self): """ return list of prerequisite tests this test depends on """ return self.specs['prereq'] From 71b72ed88b4631564cff60aa9e8df1a690ec4358 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 15:13:14 -0600 Subject: [PATCH 072/243] Define prereqs based on the short test name --- python/TestHarness/JobDAG.py | 4 ++-- python/TestHarness/TestHarness.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 027f70defe58..58ac151e6a5d 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -52,7 +52,7 @@ def createJobs(self, testers): self.__name_to_job = {} for tester in testers: job = Job(tester, self, self.options) - name = job.getTestName() + name = job.getTestNameShort() if name not in self.__name_to_job: self.__name_to_job[name] = job else: @@ -155,7 +155,7 @@ def _doMakeDependencies(self): # test file has invalid prereq set except KeyError: - job.setStatus(job.error, 'unknown dependency') + job.setStatus(job.error, f'unknown dependency {prereq_job}') def _fix_cornercases(self, prereq_job, job): """ diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 05bdaad6c8e0..f2ff54764174 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -547,8 +547,6 @@ def augmentParameters(self, filename, tester, testroot_params={}): if type(params['prereq']) != list: print(("Option 'prereq' needs to be of type list in " + params['test_name'])) sys.exit(1) - elif (params['prereq'] != ['ALL']): - params['prereq'] = [relative_path.replace('/tests/', '') + '.' 
+ item for item in params['prereq']] # Double the alloted time for tests when running with the valgrind option tester.setValgrindMode(self.options.valgrind_mode) @@ -584,7 +582,7 @@ def appendRecoverableTests(self, testers): # Part 2: part2_params = part2.parameters() - part2_params['prereq'].append(part1.parameters()['test_name']) + part2_params['prereq'].append(part1.getTestNameShort()) part2_params['delete_output_before_running'] = False part2_params['cli_args'].append('--recover --recoversuffix ' + self.options.recoversuffix) part2.addCaveats('recover') From 89dc7360fdf9a7dc5d67bf3232cf2e8d75908317 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 15:13:54 -0600 Subject: [PATCH 073/243] Don't check skipped jobs for this test --- python/TestHarness/JobDAG.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 58ac151e6a5d..af2ab695719b 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -240,6 +240,8 @@ def _checkOutputCollisions(self): jobs = list(self.__job_dag.topological_sort()) # Sort by ID so we get it in the input file from top down jobs = sorted(jobs, key = lambda job: job.getID()) + # Don't check skipped jobs because their dependencies will have been removed + jobs = [job for job in jobs if not job.isSkip()] # Work down the file, starting with the second input and looking up for # collisions. By doing it in this order, we will error at the first occurance. 
@@ -255,8 +257,8 @@ def _checkOutputCollisions(self): other_files = set(other_tester.getOutputFiles(self.options)) conflicting_files = list(files.intersection(other_files)) if conflicting_files \ - and not self.__job_dag.is_dependency(other_job, job): - print(f'In {tester.getSpecFile()}:\n') + and not self.__job_dag.is_dependency(other_job, job) \ + and not self.__job_dag.is_dependency(job, other_job): print(' This test spec is set to run in parallel, but a race condition was found') print(' that could lead to multiple tests reading/writing from the same file.\n') print(f' Tests: {tester.getTestNameShort()}, {other_tester.getTestNameShort()}') From ece9aae1319e4a9ffa93e3b8d026af284d45fa1e Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 15:28:08 -0600 Subject: [PATCH 074/243] Move around the dag setup to support checking collisions before removing edges --- python/TestHarness/JobDAG.py | 37 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index af2ab695719b..0535f1172e23 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -107,11 +107,15 @@ def removeAllDependencies(self): def _checkDAG(self): """ perform some sanity checks on the current DAG """ if self.__job_dag.size(): + # Add edges based on prereqs + self._setupPrereqs() - self._doMakeDependencies() - self._doLast() + # Check for race conditions in output self._checkOutputCollisions() + # Remove edges for jobs that are skipped + self._doSkippedDependencies() + # If there are race conditions, then there may be more skipped jobs if self._doRaceConditions(): self._doSkippedDependencies() @@ -130,22 +134,17 @@ def _addEdge(self, child, parent): parent.setOutput('Cyclic dependency error!\n\t' + err_output) parent.setStatus(parent.error, 'Cyclic or Invalid Dependency Detected!') - def _doLast(self): - for job in self.__job_dag.topological_sort(): - if 'ALL' in 
job.getPrereqs(): - for a_job in self.__job_dag.topological_sort(): - if a_job != job and not a_job.isSkip(): - if '.ALL' in a_job.getTestName(): - a_job.setStatus(a_job.error, 'Test named ALL when "prereq = ALL" elsewhere in test spec file!') - self._addEdge(a_job, job) - self._doSkippedDependencies() - - def _doMakeDependencies(self): + def _setupPrereqs(self): """ Setup dependencies within the current Job DAG """ + # The jobs that have 'ALL' as a prereq + all_prereq_jobs = [] + + # Setup explicit dependencies (without 'ALL') for job in self.__job_dag.ind_nodes(): prereq_jobs = job.getPrereqs() if prereq_jobs == ['ALL']: - prereq_jobs = [] + all_prereq_jobs.append(job) + continue for prereq_job in prereq_jobs: try: self.__name_to_job[prereq_job] @@ -157,6 +156,14 @@ def _doMakeDependencies(self): except KeyError: job.setStatus(job.error, f'unknown dependency {prereq_job}') + # Setup dependencies for 'ALL' + for job in all_prereq_jobs: + for a_job in self.__job_dag.topological_sort(): + if a_job != job and not a_job.isSkip(): + if '.ALL' in a_job.getTestName(): + a_job.setStatus(a_job.error, 'Test named ALL when "prereq = ALL" elsewhere in test spec file!') + self._addEdge(a_job, job) + def _fix_cornercases(self, prereq_job, job): """ Fix skipped dependency when we have a heavy test depend on a not-heavy test @@ -240,8 +247,6 @@ def _checkOutputCollisions(self): jobs = list(self.__job_dag.topological_sort()) # Sort by ID so we get it in the input file from top down jobs = sorted(jobs, key = lambda job: job.getID()) - # Don't check skipped jobs because their dependencies will have been removed - jobs = [job for job in jobs if not job.isSkip()] # Work down the file, starting with the second input and looking up for # collisions. By doing it in this order, we will error at the first occurance. 
From 25bfbfd4e055518c38b4e37b99eac1cc34319269 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 16:18:15 -0600 Subject: [PATCH 075/243] Skip tests on HPC where appropriate --- test/tests/misc/solution_invalid/tests | 4 ++++ test/tests/outputs/iterative/tests | 2 ++ test/tests/postprocessors/num_residual_eval/tests | 2 ++ 3 files changed, 8 insertions(+) diff --git a/test/tests/misc/solution_invalid/tests b/test/tests/misc/solution_invalid/tests index 0a8af98cf4ab..b20f3627604d 100644 --- a/test/tests/misc/solution_invalid/tests +++ b/test/tests/misc/solution_invalid/tests @@ -60,6 +60,7 @@ type = JSONDiff input = solution_invalid.i jsondiff = 'solution_invalid.json' + hpc = false # iteration counts can be MPI dependent design = 'SolutionInvalidity.md' issues = '#22814' requirement = 'The system shall be able to output detailed reasons and occurrences as to why a solution is invalid to file' @@ -72,6 +73,7 @@ input = solution_invalid.i cli_args = "Executioner/type=Transient Executioner/num_steps=1 Executioner/dtmin=1e-13 Executioner/error_on_dtmin=false Outputs/file_base='solution_invalid_transient' " jsondiff = 'solution_invalid_transient.json' + hpc = false # iteration counts can be MPI dependent design = 'SolutionInvalidity.md' issues = '#22814' requirement = 'The system shall be able to output detailed information about why a solution is invalid to a file in transient simulations' @@ -84,6 +86,7 @@ input = solution_invalid.i cli_args = "Materials/filter/test_different_procs=true Outputs/file_base='solution_invalid_parallel'" jsondiff = 'solution_invalid_parallel.json' + hpc = false # iteration counts can be MPI dependent design = 'SolutionInvalidity.md' min_parallel = 3 max_parallel = 3 @@ -113,6 +116,7 @@ type = JSONDiff input = solution_invalid_recover.i jsondiff = 'solution_invalid_checkpoint.json' + hpc = false # iteration counts can be MPI dependent cli_args = "Outputs/file_base='solution_invalid_checkpoint' Outputs/checkpoint=true" detail 
= 'outputting of checkpoint files' [] diff --git a/test/tests/outputs/iterative/tests b/test/tests/outputs/iterative/tests index b25cd2a457f8..a09ab47ac88e 100644 --- a/test/tests/outputs/iterative/tests +++ b/test/tests/outputs/iterative/tests @@ -78,6 +78,8 @@ recover = false petsc_version_release = true compiler = '!INTEL' + # MPI dependent + hpc = false requirement = "The system shall support output during linear and non-linear iterations during transient simulations using the CSV format." [../] diff --git a/test/tests/postprocessors/num_residual_eval/tests b/test/tests/postprocessors/num_residual_eval/tests index 99e6fb9b7172..a897304d744a 100644 --- a/test/tests/postprocessors/num_residual_eval/tests +++ b/test/tests/postprocessors/num_residual_eval/tests @@ -13,6 +13,8 @@ # This test requires a different number of residual evaluations in older PETScs # due to the changes in 3061bbd5d. petsc_version = '>=3.8.3' + # Difficult to have consistent solve behavior when running across nodes + hpc = false requirement = 'The system shall be capable of outputting the number of Residual evaluations' issues = '#2089' From 862786514c3bdbd86d1d741946fc4303e56ced0e Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 16:47:14 -0600 Subject: [PATCH 076/243] Remove unused import --- python/TestHarness/testers/RunException.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index 6783fe699667..2f4684b38816 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -9,7 +9,6 @@ from TestHarness import util from RunApp import RunApp -import os class RunException(RunApp): From 9bef851e6b3b3ac57b19c3591b302c3978a541fa Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 16:54:58 -0600 Subject: [PATCH 077/243] Decrease polling times to 0.1s --- python/TestHarness/runners/HPCRunner.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index da056cdac130..387952cd4e9d 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -39,7 +39,7 @@ def wait(self, timer): # This gets a structure that represents the job, and the # polling itself is only done on occasion within RunHPC while True: - time.sleep(1) + time.sleep(0.1) hpc_job = self.run_hpc.getHPCJob(self.job) # We're done @@ -79,7 +79,7 @@ def wait(self, timer): incomplete_files = set() # Wait for all of the files to be available - file_poll_interval = 0.25 + file_poll_interval = 0.1 waited_time = 0 while wait_files or incomplete_files: # Look for each file From 592abb579bb1a4832155c0979816e63948148538 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 16:56:39 -0600 Subject: [PATCH 078/243] Set options in constructor --- python/TestHarness/runners/HPCRunner.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 387952cd4e9d..a5033bc87906 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -27,6 +27,12 @@ def __init__(self, job, options, run_hpc): # just hang forever self.wait_output_time = 120 + # Interval in seconds for polling for job status + self.job_status_poll_time = 0.1 + + # Interval in seconds for polling for file completion + self.file_completion_poll_time = 0.1 + def spawn(self, timer): self.run_hpc.submitJob(self.job) timer.start() @@ -39,7 +45,7 @@ def wait(self, timer): # This gets a structure that represents the job, and the # polling itself is only done on occasion within RunHPC while True: - time.sleep(0.1) + time.sleep(self.job_status_poll_time) hpc_job = self.run_hpc.getHPCJob(self.job) # We're done @@ -79,7 +85,6 @@ def wait(self, timer): incomplete_files = set() # Wait for all 
of the files to be available - file_poll_interval = 0.1 waited_time = 0 while wait_files or incomplete_files: # Look for each file @@ -112,8 +117,8 @@ def print_files(files, type): print_files(incomplete_files, 'Incomplete') break - waited_time += file_poll_interval - time.sleep(file_poll_interval) + waited_time += self.file_completion_poll_time + time.sleep(self.file_completion_poll_time) def kill(self): self.run_hpc.killJob(self.job) From e8682e33775f044a65ad46d53397897a9a7b2793 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 15 May 2024 17:12:15 -0600 Subject: [PATCH 079/243] Add comments, manage when we failed to read the output --- python/TestHarness/runners/HPCRunner.py | 37 +++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index a5033bc87906..a4ecbcd49c1e 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -34,7 +34,9 @@ def __init__(self, job, options, run_hpc): self.file_completion_poll_time = 0.1 def spawn(self, timer): + # Rely on the RunHPC object to submit the job self.run_hpc.submitJob(self.job) + timer.start() def wait(self, timer): @@ -98,7 +100,9 @@ def wait(self, timer): if self.fileIsReady(file): # Store the output if file == output_file: - self.trySetOutput(throw=True) + # It's now required because its complete + if not self.trySetOutput(required=True): + break # Done with this file incomplete_files.discard(file) @@ -123,16 +127,33 @@ def print_files(files, type): def kill(self): self.run_hpc.killJob(self.job) - def trySetOutput(self, throw=False): + def trySetOutput(self, required=False): + """ + Tries to set the output if it exists. + + If required is set, this will fail the job. + + Returns whether or not the output was set. 
+ """ + # self.output is originally set to None so that other objects + # cannot attempt to write to it before we have at least obtained + # some object, hence why we need to set it here because we're + # signaling that we're ready for output if self.output is None: self.output = '' + # Whether or not we actually set it + did_set = False + + # Only do something if the output actually exists output_file = self.run_hpc.getHPCJobOutputPath(self.job) if os.path.exists(output_file) and os.path.isfile(output_file): try: - # If we're trying to read output, we can't truncate it + # If we're trying to parse output, we can't truncate it + # because it might appear in the truncated portion if self.job.getTester().needFullOutput(self.options): self.output = open(output_file, 'r').read() + # Not parsing the output, so just read it truncated else: self.output = self.readTruncated(output_file) @@ -141,11 +162,17 @@ def trySetOutput(self, throw=False): find_exit_code = re.search('Completed TestHarness RunHPC test execution; exit code = (\d+)', self.output) if find_exit_code: self.exit_code = int(find_exit_code.group(1)) + + did_set = True except: - if throw: - raise pass + # If required and we didn't load it, mark this error + if required and not did_set: + self.job.setStatus(self.job.error, 'FAILED OUTPUT READ') + + return did_set + def fileIsReady(self, file): """ Checks if a file is ready for reading. 
From d479534e275410ddedaaee3eec30ec361ecd7182 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 07:34:21 -0600 Subject: [PATCH 080/243] Rename PBS and queue opts to HPC, remove separate results for HPC --- python/TestHarness/TestHarness.py | 36 +++++++------------ python/TestHarness/schedulers/Job.py | 3 +- python/TestHarness/schedulers/RunHPC.py | 20 +++++++---- python/TestHarness/schedulers/RunPBS.py | 10 +++--- python/TestHarness/schedulers/Scheduler.py | 4 +-- python/TestHarness/testers/AnalyzeJacobian.py | 6 ++-- python/TestHarness/testers/CheckFiles.py | 4 +-- python/TestHarness/testers/PythonUnitTest.py | 2 +- python/TestHarness/testers/RunApp.py | 2 +- python/TestHarness/testers/RunException.py | 6 ++-- python/TestHarness/testers/SignalTester.py | 6 ++-- python/TestHarness/testers/Tester.py | 5 ++- 12 files changed, 49 insertions(+), 55 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index f2ff54764174..8e9238e54fb7 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -794,8 +794,7 @@ def writeResults(self): self.options.results_storage['INPUT_FILE_NAME'] = self.options.input_file_name # Record that we are using --sep-files* options - self.options.results_storage['SEP_FILES'] = (True if self.options.pbs else False - or self.options.ok_files + self.options.results_storage['SEP_FILES'] = (self.options.ok_files or self.options.fail_files or self.options.sep_files) @@ -811,7 +810,7 @@ def writeResults(self): self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) # If output has been stored in separate files, don't make additional copies by - # storing that data in this json results file (--pbs || --sep-files, etc options). + # storing that data in this json results file (--sep-files, etc options). 
output = '' if job.getOutputFile() else job.getOutput() self.options.results_storage[job.getTestDir()][job.getTestName()] = {'NAME' : job.getTestNameShort(), @@ -900,7 +899,7 @@ def initialize(self, argv, app_name): plugin_paths = [os.path.join(self.moose_dir, 'python', 'TestHarness'), os.path.join(self.moose_dir, 'share', 'moose', 'python', 'TestHarness')] self.factory.loadPlugins(plugin_paths, 'schedulers', "IS_SCHEDULER") - if self.options.pbs: + if self.options.hpc == 'pbs': scheduler_plugin = 'RunPBS' # The default scheduler plugin else: @@ -973,7 +972,7 @@ def initialize(self, argv, app_name): def useExistingStorage(self): """ reasons for returning bool if we should use a previous results_storage file """ if (os.path.exists(self.options.results_file) - and (self.options.failed_tests or self.options.pbs or self.options.show_last_run)): + and (self.options.failed_tests or self.options.show_last_run)): return True elif ((self.options.failed_tests or self.options.show_last_run) and not os.path.exists(self.options.results_file)): @@ -1058,12 +1057,13 @@ def parseCLArgs(self, argv): outputgroup.add_argument("--results-file", nargs=1, default=self.results_file, help="Save run_tests results to an alternative json file (default: %(default)s)") outputgroup.add_argument("--show-last-run", action="store_true", dest="show_last_run", help="Display previous results without executing tests again") - queuegroup = parser.add_argument_group('Queue Options', 'Options controlling which queue manager to use') - queuegroup.add_argument('--pbs', action='store_true', dest='pbs', help='Launch tests using PBS as your scheduler') - queuegroup.add_argument('--pbs-project', nargs=1, action='store', dest='queue_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') - queuegroup.add_argument('--pbs-queue', nargs=1, action='store', dest='queue_queue', type=str, metavar='', help='Submit jobs to the specified queue') - 
queuegroup.add_argument('--hpc-host', nargs=1, action='store', dest='queue_host', metavar='', help='The host to use for submitting HPC jobs') - queuegroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='queue_source_command', metavar='', help='Source specified file before launching HPC tests') + # Options for HPC execution + hpcgroup = parser.add_argument_group('HPC Options', 'Options controlling HPC execution') + hpcgroup.add_argument('--hpc', dest='hpc', choices=['pbs'], help='Launch tests using a HPC scheduler') + hpcgroup.add_argument('--hpc-host', nargs=1, action='store', dest='hpc_host', metavar='', help='The host to use for submitting HPC jobs') + hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') + hpcgroup.add_argument('--pbs-project', nargs=1, action='store', dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') + hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') code = True if self.code.decode() in argv: @@ -1102,21 +1102,9 @@ def checkAndUpdateCLArgs(self): if opts.spec_file and not os.path.exists(opts.spec_file): print('ERROR: --spec-file supplied but path does not exist') sys.exit(1) - if opts.queue_source_command and not os.path.exists(opts.queue_source_command): - print('ERROR: pre-source supplied but path does not exist') - sys.exit(1) - if opts.failed_tests and not opts.pbs and not os.path.exists(opts.results_file): + if opts.failed_tests and not os.path.exists(opts.results_file): print('ERROR: --failed-tests could not detect a previous run') sys.exit(1) - if opts.pbs and opts.pedantic_checks: - print('ERROR: --pbs and --pedantic-checks cannot be used simultaneously') - sys.exit(1) - if opts.pbs and opts.jobs: - print('ERROR: --pbs and -j|--jobs cannot be 
used simultaneously') - sys.exit(1) - if opts.pbs and opts.extra_info: - print('ERROR: --pbs and -e (extra info) cannot be used simultaneously') - sys.exit(1) if opts.verbose and opts.quiet: print('Do not be an oxymoron with --verbose and --quiet') sys.exit(1) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index c14843a745ff..7a8e50abe56e 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -275,8 +275,7 @@ def getOutput(self): def getOutputFile(self): """ Return the output file path """ - if ((self.options.pbs - or self.options.ok_files + if ((self.options.ok_files or self.options.fail_files or self.options.sep_files) and (self.isPass() or self.isFail())): diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 069f3cbd82a3..d08bdacff6a8 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -41,7 +41,7 @@ def __init__(self, harness, params): self.hpc_jobs = {} # The jump hostname for running commands, if any - self.ssh_host = self.options.queue_host + self.ssh_host = self.options.hpc_host # The SSH key to use for connections self.ssh_key_filename = None # The pool of processes for running threaded SSH comments @@ -77,19 +77,25 @@ def __init__(self, harness, params): if not self.ssh_host: print('ERROR: --hpc-host must be set when using HPC jobs within apptainer') sys.exit(1) - if not self.options.queue_source_command: + if not self.options.hpc_pre_source: default_pre_source = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_source') - self.options.queue_source_command = default_pre_source + self.options.hpc_pre_source = default_pre_source print(f'INFO: Setting --hpc-pre-source={default_pre_source}') - if self.options.queue_source_command and not os.path.exists(self.options.queue_source_command): - print(f'ERROR: --hpc-pre-source path {self.options.queue_source_command} does not 
exist') + if self.options.hpc_pre_source and not os.path.exists(self.options.hpc_pre_source): + print(f'ERROR: --hpc-pre-source path {self.options.hpc_pre_source} does not exist') + sys.exit(1) + if self.options.hpc and self.options.pedantic_checks: + print('ERROR: --hpc and --pedantic-checks cannot be used simultaneously') + sys.exit(1) + if self.options.hpc and self.options.jobs: + print('ERROR: --hpc and -j|--jobs cannot be used simultaneously') sys.exit(1) # Load the pre-source if it exists self.source_contents = None - if self.options.queue_source_command: - self.source_contents = open(self.options.queue_source_command, 'r').read() + if self.options.hpc_pre_source: + self.source_contents = open(self.options.hpc_pre_source, 'r').read() class HPCJob: """ diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index df5bc165c2bf..3604c50a4436 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -42,7 +42,7 @@ def _submitJob(self, job, job_data): template_env = {'NAME': job_data.name, 'SELECT': f'{job_data.num_procs}:mpiprocs=1:ncpus={job_data.num_threads}', 'WALLTIME': job_data.walltime, - 'PROJECT': self.options.queue_project, + 'PROJECT': self.options.pbs_project, 'OUTPUT': job_data.output_file, 'SUBMISSION_SCRIPT': job_data.submission_script, 'PLACE': 'scatter', @@ -55,10 +55,10 @@ def _submitJob(self, job, job_data): 'ENDING_COMMENT': self.getOutputEndingComment(), 'MOOSE_PYTHONPATH': moose_python, 'ADDITIONAL_OUTPUT_FILES': job_data.additional_output_files} - if self.options.queue_queue: - template_env['QUEUE'] = options.queue_queue - if self.options.queue_source_command: - template_env['SOURCE_FILE'] = options.queue_source_command + if self.options.hpc_queue: + template_env['QUEUE'] = options.hpc_queue + if self.options.hpc_pre_source: + template_env['SOURCE_FILE'] = options.hpc_pre_source if self.source_contents: template_env['SOURCE_CONTENTS'] = self.source_contents 
diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index e0a829bc71ae..a064f89f9040 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -164,7 +164,7 @@ def maxFailures(self): """ Boolean for hitting max failures """ return ((self.options.valgrind_mode and self.__failures >= self.options.valgrind_max_fails) or self.__failures >= self.options.max_fails - and not self.options.pbs) + and not self.options.hpc) def run(self, job): """ Call derived run method """ @@ -399,7 +399,7 @@ def jobStatus(self, job): if (not self.options.sep_files and not self.options.ok_files and not self.options.fail_files - and not self.options.pbs + and not self.options.hpc and not self.options.heavy_tests and not self.options.valgrind_mode): job.addCaveats('FINISHED') diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index da4544937a17..0cfe84b5a66f 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -90,9 +90,9 @@ def processResults(self, moose_dir, options, output): return output def checkRunnable(self, options): - # We cannot rely on an external script running things with PBS - if options.pbs: - self.addCaveats('PBS NOT SUPPORTED') + # We cannot rely on an external script running things within HPC + if options.hpc: + self.addCaveats('hpc unsupported') self.setStatus(self.skip) return False diff --git a/python/TestHarness/testers/CheckFiles.py b/python/TestHarness/testers/CheckFiles.py index 070957dfa7c7..df9f1da06c69 100644 --- a/python/TestHarness/testers/CheckFiles.py +++ b/python/TestHarness/testers/CheckFiles.py @@ -78,8 +78,8 @@ def processResults(self, moose_dir, options, output): def checkRunnable(self, options): # We cannot reliably check if files do not exist with a networked file system - if options.pbs and self.specs['check_not_exists']: - 
self.addCaveats('PBS NOT SUPPORTED') + if options.hpc and self.specs['check_not_exists']: + self.addCaveats('hpc unsupported') self.setStatus(self.skip) return False diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 7bb888ac2f44..82492de131fb 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -63,7 +63,7 @@ def getProcs(self, options): # If we start within a script within apptainer and then call mpiexec on HPC, # it will not work because the mpiexec call needs to be outside of the apptainer # call. So, limit these tests to 1 proc - if options.pbs and \ + if options.hpc and \ os.environ.get('APPTAINER_CONTAINER') and \ int(self.specs['min_parallel']) == 1 and procs != 1: self.addCaveats('hpc apptainer max_cpus=1') diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index fe2671aedfb0..c2544cdbe263 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -120,7 +120,7 @@ def checkRunnable(self, options): return False if options.pbs and self.specs.isValid('command_proxy') and os.environ.get('APPTAINER_CONTAINER') is not None: - self.addCaveats('PBS NOT SUPPORTED') + self.addCaveats('hpc unsupported') self.setStatus(self.skip) return False diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index 2f4684b38816..2855489cd422 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -38,7 +38,7 @@ def checkRunnable(self, options): # We seem to have issues with --redirect-output causing # "Inappropriate ioctl for device (25)" errors, so if this test # requires more procs, we can't run it - if options.pbs and int(self.specs['min_parallel'] > 1): + if options.hpc and int(self.specs['min_parallel'] > 1): self.addCaveats('PBS max_cpus=1') return False return RunApp.checkRunnable(self, 
options) @@ -58,8 +58,8 @@ def getProcs(self, options): # We seem to have issues with --redirect-output causing # "Inappropriate ioctl for device (25)" errors, so if this test # doesn't require more procs, just set it to zero - if options.pbs and int(self.specs['min_parallel']) == 1 and procs != 1: - self.addCaveats('PBS max_cpus=1') + if options.hpc and int(self.specs['min_parallel']) == 1 and procs != 1: + self.addCaveats('hpc max_cpus=1') return 1 return procs diff --git a/python/TestHarness/testers/SignalTester.py b/python/TestHarness/testers/SignalTester.py index bcbbd97f37ef..4338b22ebd15 100644 --- a/python/TestHarness/testers/SignalTester.py +++ b/python/TestHarness/testers/SignalTester.py @@ -39,8 +39,10 @@ def __init__(self, name, params): raise e def checkRunnable(self, options): - if options.pbs: - self.addCaveats('PBS NOT SUPPORTED') + # We could probably configure sending signals via pbs and slurm + # but for now that's a no + if options.hpc: + self.addCaveats('hpc unsupported') self.setStatus(self.skip) return False diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 1f8e6b8e1691..0fd11c3713aa 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -765,14 +765,13 @@ def checkRunnableBase(self, options): self.setStatus(self.fail, 'ABSOLUTE PATH DETECTED') # We can't offer the option of reading output files outside of initial TestDir - if self.specs['working_directory'] and (options.pbs - or options.ok_files + if self.specs['working_directory'] and (options.ok_files or options.fail_files or options.sep_files): reasons['working_directory'] = '--sep-files* enabled' # Explicitly skip HPC tests - if not self.specs['hpc'] and options.pbs: + if not self.specs['hpc'] and options.hpc: reasons['hpc'] = 'hpc=false' ##### The below must be performed last to register all above caveats ##### From db6b74a2eb058fa1ff5e3f0412b7827b050f8a0e Mon Sep 17 00:00:00 2001 From: Logan Harbour 
Date: Thu, 16 May 2024 07:39:07 -0600 Subject: [PATCH 081/243] Make file timeout a command line option --- python/TestHarness/TestHarness.py | 3 ++- python/TestHarness/runners/HPCRunner.py | 9 +-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 8e9238e54fb7..8b082f2c2859 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1059,9 +1059,10 @@ def parseCLArgs(self, argv): # Options for HPC execution hpcgroup = parser.add_argument_group('HPC Options', 'Options controlling HPC execution') - hpcgroup.add_argument('--hpc', dest='hpc', choices=['pbs'], help='Launch tests using a HPC scheduler') + hpcgroup.add_argument('--hpc', dest='hpc', action='store', choices=['pbs'], help='Launch tests using a HPC scheduler') hpcgroup.add_argument('--hpc-host', nargs=1, action='store', dest='hpc_host', metavar='', help='The host to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') + hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') hpcgroup.add_argument('--pbs-project', nargs=1, action='store', dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index a4ecbcd49c1e..ed73a2d70443 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -20,13 +20,6 @@ def __init__(self, job, options, run_hpc): # The RunHPC object self.run_hpc = run_hpc - # Number 
of seconds to try to wait for the output - # We don't want to wait forever for output because - # if the job ended in an unexpected state, it might - # not even be using the output and we don't want to - # just hang forever - self.wait_output_time = 120 - # Interval in seconds for polling for job status self.job_status_poll_time = 0.1 @@ -107,7 +100,7 @@ def wait(self, timer): incomplete_files.discard(file) # We've waited for files for too long - if (wait_files or incomplete_files) and waited_time >= self.wait_output_time: + if (wait_files or incomplete_files) and waited_time >= self.options.hpc_file_timeout: self.job.setStatus(self.job.error, 'FILE TIMEOUT') if self.output is None: self.trySetOutput() From b611f2bac6bbd8e8108f81f58f53f268ee70fd1b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 09:04:19 -0600 Subject: [PATCH 082/243] Setup useful PBS exit codes --- python/TestHarness/runners/HPCRunner.py | 3 +- python/TestHarness/schedulers/PBScodes.py | 58 +++++++++++------------ python/TestHarness/schedulers/RunPBS.py | 13 ++++- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index ed73a2d70443..b228c26e86d1 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -56,7 +56,8 @@ def wait(self, timer): # If the Job is already finished, something happened in PBS # so we have an invalid state for processing in the Tester if self.job.isFinished(): - self.exit_code = -1 + if self.exit_code is None: + self.exit_code = -1 # If we have output, we should try to add it self.trySetOutput() diff --git a/python/TestHarness/schedulers/PBScodes.py b/python/TestHarness/schedulers/PBScodes.py index d90eead12369..e2a8e8b2195c 100644 --- a/python/TestHarness/schedulers/PBScodes.py +++ b/python/TestHarness/schedulers/PBScodes.py @@ -43,35 +43,35 @@ # These errors come from src/include/job.h for OpenPBS # Negative exit 
status indicates that the job could not be executed. -PBS_User_EXITCODES = { '0' : 'JOB_EXEC_OK:Job execution was successful', - '-1' : 'JOB_EXEC_FAIL1:Job execution failed, before files, no retry', - '-2' : 'JOB_EXEC_FAIL2:Job execution failed, after files, no retry', - '-3' : 'JOB_EXEC_RETRY:Job execution failed, do retry', - '-4' : 'JOB_EXEC_INITABT:Job aborted on MoM initialization', - '-5' : 'JOB_EXEC_INITRST:Job aborted on MoM initialization, checkpoint, no migrate', - '-6' : 'JOB_EXEC_INITRMG:Job aborted on MoM initialization, checkpoint, ok migrate', - '-7' : 'JOB_EXEC_BADRESRT:Job restart failed', - '-10' : 'JOB_EXEC_FAILUID:Invalid UID/GID for job', - '-11' : 'JOB_EXEC_RERUN:Job was rerun', - '-12' : 'JOB_EXEC_CHKP:Job was checkpointed and killed', - '-13' : 'JOB_EXEC_FAIL_PASSWORD:Job failed due to a bad password', - '-14' : 'JOB_EXEC_RERUN_ON_SIS_FAIL:Job was requeued (if rerunnable) or deleted (if not) due to a communication failure between the primary execution host MoM and a Sister', - '-15' : 'JOB_EXEC_QUERST:Requeue job for restart from checkpoint', - '-16' : 'JOB_EXEC_FAILHOOK_RERUN:Job execution failed due to hook rejection; requeue for later retry', - '-17' : 'JOB_EXEC_FAILHOOK_DELETE:Job execution failed due to hook rejection; delete the job at end', - '-18' : 'JOB_EXEC_HOOK_RERUN:A hook requested for job to be requeued', - '-19' : 'JOB_EXEC_HOOK_DELETE:A hook requested for job to be deleted', - '-20' : 'JOB_EXEC_RERUN_MS_FAIL:Job requeued because server could not contact the primary execution host MoM', - '-21' : 'JOB_EXEC_FAIL_SECURITY:Security breach in PBS directory', - '-22' : 'JOB_EXEC_HOOKERROR:Job exec failed due to unexpected exception or hook execution timed out', - '-23' : 'JOB_EXEC_FAIL_KRB5:Error no kerberos credentials supplied', - '-24' : 'JOB_EXEC_KILL_NCPUS_BURST:Job exec failed due to exceeding ncpus (burst)', - '-25' : 'JOB_EXEC_KILL_NCPUS_SUM:Job exec failed due to exceeding ncpus (sum)', - '-26' : 'JOB_EXEC_KILL_VMEM:Job 
exec failed due to exceeding vmem', - '-27' : 'JOB_EXEC_KILL_MEM:Job exec failed due to exceeding mem', - '-28' : 'JOB_EXEC_KILL_CPUT:Job exec failed due to exceeding cput', - '-29' : 'JOB_EXEC_KILL_WALLTIME:Job exec failed due to exceeding walltime', - '-30' : 'JOB_EXEC_JOINJOB:Job exec failed due to join job error' } +PBS_User_EXITCODES = { 0 : ('JOB_EXEC_OK', 'Job execution was successful'), + -1 : ('JOB_EXEC_FAIL1', 'Job execution failed, before files, no retry'), + -2 : ('JOB_EXEC_FAIL2', 'Job execution failed, after files, no retry'), + -3 : ('JOB_EXEC_RETRY', 'Job execution failed, do retry'), + -4 : ('JOB_EXEC_INITABT', 'Job aborted on MoM initialization'), + -5 : ('JOB_EXEC_INITRST', 'Job aborted on MoM initialization, checkpoint, no migrate'), + -6 : ('JOB_EXEC_INITRMG', 'Job aborted on MoM initialization, checkpoint, ok migrate'), + -7 : ('JOB_EXEC_BADRESRT', 'Job restart failed'), + -10 : ('JOB_EXEC_FAILUID', 'Invalid UID/GID for job'), + -11 : ('JOB_EXEC_RERUN', 'Job was rerun'), + -12 : ('JOB_EXEC_CHKP', 'Job was checkpointed and killed'), + -13 : ('JOB_EXEC_FAIL_PASSWORD', 'Job failed due to a bad password'), + -14 : ('JOB_EXEC_RERUN_ON_SIS_FAIL', 'Job was requeued (if rerunnable) or deleted (if not) due to a communication failure between the primary execution host MoM and a Sister'), + -15 : ('JOB_EXEC_QUERST', 'Requeue job for restart from checkpoint'), + -16 : ('JOB_EXEC_FAILHOOK_RERUN', 'Job execution failed due to hook rejection; requeue for later retry'), + -17 : ('JOB_EXEC_FAILHOOK_DELETE', 'Job execution failed due to hook rejection; delete the job at end'), + -18 : ('JOB_EXEC_HOOK_RERUN', 'A hook requested for job to be requeued'), + -19 : ('JOB_EXEC_HOOK_DELETE', 'A hook requested for job to be deleted'), + -20 : ('JOB_EXEC_RERUN_MS_FAIL', 'Job requeued because server could not contact the primary execution host MoM'), + -21 : ('JOB_EXEC_FAIL_SECURITY', 'Security breach in PBS directory'), + -22 : ('JOB_EXEC_HOOKERROR', 'Job exec failed due 
to unexpected exception or hook execution timed out'), + -23 : ('JOB_EXEC_FAIL_KRB5', 'Error no kerberos credentials supplied'), + -24 : ('JOB_EXEC_KILL_NCPUS_BURST', 'Job exec failed due to exceeding ncpus (burst)'), + -25 : ('JOB_EXEC_KILL_NCPUS_SUM', 'Job exec failed due to exceeding ncpus (sum)'), + -26 : ('JOB_EXEC_KILL_VMEM', 'Job exec failed due to exceeding vmem'), + -27 : ('JOB_EXEC_KILL_MEM', 'Job exec failed due to exceeding mem'), + -28 : ('JOB_EXEC_KILL_CPUT', 'Job exec failed due to exceeding cput'), + -29 : ('JOB_EXEC_KILL_WALLTIME', 'Job exec failed due to exceeding walltime'), + -30 : ('JOB_EXEC_JOINJOB', 'Job exec failed due to join job error') } PBS_STATUSES = { '0' : 'UNKNOWN', 'B' : 'BEGUN', diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 3604c50a4436..f4b4446a7ed5 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -10,7 +10,7 @@ import os, re, json, socket, time from RunParallel import RunParallel from RunHPC import RunHPC -from PBScodes import * +from PBScodes import PBS_User_EXITCODES import jinja2 ## This Class is responsible for maintaining an interface to the PBS scheduling syntax @@ -152,8 +152,17 @@ def updateJobs(self): pbs_job.exit_code = exit_code pbs_job.state = state + # Negative exit code, means PBS killed it for some reason + # Try to find it in our pbs exit code list to return something useful + if exit_code is not None and exit_code < 0: + name_reason_tup = PBS_User_EXITCODES.get(exit_code) + if name_reason_tup is not None: + name, _ = name_reason_tup + job.setStatus(job.error, f'PBS ERROR: {name}') + else: + terminated = True # Mark the job as terminated (past walltime, over resources, killed) - if terminated: + if terminated and not job.isFinished(): job.setStatus(job.error, 'PBS JOB TERMINATED') except Exception as e: raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e From 
8b0e3cc2f0ba7fd3e00ce4f9569ed6919eafbe63 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 09:08:06 -0600 Subject: [PATCH 083/243] Generalize the state to running or not --- python/TestHarness/schedulers/RunHPC.py | 4 ++-- python/TestHarness/schedulers/RunPBS.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index d08bdacff6a8..471a369b06b5 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -114,8 +114,8 @@ def __init__(self, id, command): # Whether or not this job was killed; used so what we don't # bother killing a job multiple times self.killed = False - # The job state as defined by PBS - self.state = None + # Whether or not the job is currently running + self.running = False class CallHPCException(Exception): """ diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f4b4446a7ed5..c907aa82dcb3 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -144,13 +144,17 @@ def updateJobs(self): done = exit_code is not None or terminated # Get the job state, and report running if it switched to running - if state == 'R' and pbs_job.state != 'R': + if state == 'R' and not pbs_job.running: + pbs_job.running = True self.setAndOutputJobStatus(job, job.running) + # If we were running but now we're done, we're not running anymore + if pbs_job.running and done: + pbs_job.running = False + # Update the PBSJob structure pbs_job.done = done pbs_job.exit_code = exit_code - pbs_job.state = state # Negative exit code, means PBS killed it for some reason # Try to find it in our pbs exit code list to return something useful From 482e6d98ba9fe86e0590630a35a4eb48b0d2f7cd Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 10:29:57 -0600 Subject: [PATCH 084/243] Increase max time for default timed HPC 
jobs --- python/TestHarness/JobDAG.py | 17 +++++++++------ python/TestHarness/schedulers/RunHPC.py | 11 ++++++++++ python/TestHarness/schedulers/Scheduler.py | 2 +- python/TestHarness/testers/Tester.py | 25 ++++++++++++++++------ 4 files changed, 42 insertions(+), 13 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 0535f1172e23..17946d7e3f0b 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -99,7 +99,7 @@ def removeAllDependencies(self): """ Flatten current DAG so that it no longer contains any dependency information """ if self.__name_to_job and self.__job_dag.size(): tmp_job_dag = dag.DAG() - for job in self.__job_dag.topological_sort(): + for job in self.getJobs(): tmp_job_dag.add_node(job) self.__job_dag = tmp_job_dag return self.__job_dag @@ -158,7 +158,7 @@ def _setupPrereqs(self): # Setup dependencies for 'ALL' for job in all_prereq_jobs: - for a_job in self.__job_dag.topological_sort(): + for a_job in self.getJobs(): if a_job != job and not a_job.isSkip(): if '.ALL' in a_job.getTestName(): a_job.setStatus(a_job.error, 'Test named ALL when "prereq = ALL" elsewhere in test spec file!') @@ -211,7 +211,7 @@ def _doPreviouslyFailed(self, job): def _doSkippedDependencies(self): """ Determine which jobs in the DAG should be skipped """ - for job in list(self.__job_dag.topological_sort()): + for job in self.getJobs(): dep_jobs = set([]) if self.options.failed_tests: @@ -244,9 +244,8 @@ def _checkOutputCollisions(self): if not self.canParallel(): return - jobs = list(self.__job_dag.topological_sort()) # Sort by ID so we get it in the input file from top down - jobs = sorted(jobs, key = lambda job: job.getID()) + jobs = sorted(self.getJobs(), key = lambda job: job.getID()) # Work down the file, starting with the second input and looking up for # collisions. By doing it in this order, we will error at the first occurance. 
@@ -276,7 +275,7 @@ def _doRaceConditions(self): """ Check for race condition errors within in the DAG""" # Build output_file in relation to job dictionary output_to_job = {} - for job in self.__job_dag.topological_sort(): + for job in self.getJobs(): if job.getRunnable() and not job.isFinished(): for output_file in job.getOutputFiles(self.options): output_to_job[output_file] = output_to_job.get(output_file, []) @@ -325,6 +324,12 @@ def _printDownstreams(self, job): cyclic_path.append('%s -->'% (d_job.getTestNameShort())) return ' '.join(cyclic_path) + def getJobs(self): + """ + Returns the sorted jobs in the DAG + """ + return self.__job_dag.topological_sort() + def printDAG(self): """ Print the current structure of the DAG """ job_order = [] diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 471a369b06b5..b8ba7da3d631 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -340,6 +340,17 @@ def buildRunner(self, job, options): from TestHarness.runners.HPCRunner import HPCRunner return HPCRunner(job, options, self) + def augmentJobs(self, jobs): + super().augmentJobs(jobs) + + # If a job has its default time, double it. 
We grant a little more time + # to small jobs on HPC due to slower IO, etc + for job in jobs: + tester = job.getTester() + max_time = tester.getMaxTime() + if max_time == tester.getDefaultMaxTime(): + tester.setMaxTime(max_time * 2) + @staticmethod def getHPCJobName(job) -> str: """Gets the name of the HPC job given a tester diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index a064f89f9040..38652191888b 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -253,7 +253,7 @@ def schedule(self, testers): j_dag = jobs.createJobs(testers) # Allow derived schedulers access to the jobs before they launch - self.augmentJobs(jobs) + self.augmentJobs(jobs.getJobs()) # job-count to tester-count sanity check if j_dag.size() != len(testers): diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 0fd11c3713aa..e5dd547e1703 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -25,10 +25,10 @@ def validParams(): params = MooseObject.validParams() # Common Options - params.addRequiredParam('type', "The type of test of Tester to create for this test.") - params.addParam('max_time', int(os.getenv('MOOSE_TEST_MAX_TIME', 300)), "The maximum in seconds that the test will be allowed to run.") - params.addParam('skip', "Provide a reason this test will be skipped.") - params.addParam('deleted', "Tests that only show up when using the '-e' option (Permanently skipped or not implemented).") + params.addRequiredParam('type', "The type of test of Tester to create for this test.") + params.addParam('max_time', Tester.getDefaultMaxTime(), "The maximum in seconds that the test will be allowed to run.") + params.addParam('skip', "Provide a reason this test will be skipped.") + params.addParam('deleted', "Tests that only show up when using the '-e' option (Permanently skipped or not implemented).") 
params.addParam('unique_test_id', "The unique hash given to a test") params.addParam('heavy', False, "Set to True if this test should only be run when the '--heavy' option is used.") @@ -301,6 +301,19 @@ def getMaxTime(self): """ return maximum time elapse before reporting a 'timeout' status """ return float(self.specs['max_time']) + def setMaxTime(self, value): + """ + Sets the max time for the job + """ + self.specs['max_time'] = float(value) + + @staticmethod + def getDefaultMaxTime(): + """ + Gets the default max run time + """ + return int(os.getenv('MOOSE_TEST_MAX_TIME', 300)) + def getUniqueTestID(self): """ return unique hash for test """ return self.specs['unique_test_id'] @@ -329,9 +342,9 @@ def getCheckInput(self): def setValgrindMode(self, mode): """ Increase the alloted time for tests when running with the valgrind option """ if mode == 'NORMAL': - self.specs['max_time'] = float(self.specs['max_time']) * 2 + self.setMaxTime(self.getMaxTime() * 2) elif mode == 'HEAVY': - self.specs['max_time'] = float(self.specs['max_time']) * 6 + self.setMaxTime(self.getMaxTime() * 6) def checkRunnable(self, options): """ From 0be40ac0f30a80806e1a02e3205ff16e03163ec1 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 11:21:53 -0600 Subject: [PATCH 085/243] Fix and simplify parallel scheduling --- python/TestHarness/JobDAG.py | 44 +++++----------------- python/TestHarness/schedulers/Scheduler.py | 10 ++++- 2 files changed, 19 insertions(+), 35 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 17946d7e3f0b..6bbd2317cca9 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -9,34 +9,16 @@ from .schedulers.Job import Job from contrib import dag -import pyhit -import os import sys import threading class JobDAG(object): """ Class which builds a Job DAG for use by the Scheduler """ - def __init__(self, options): + def __init__(self, options, parallel_scheduling): + self.options = options + 
self.__parallel_scheduling = parallel_scheduling self.__job_dag = dag.DAG() - self.__parallel_scheduling = None self.__j_lock = threading.Lock() - self.options = options - - def _setParallel(self): - """ Read the test spec file and determine if parallel_scheduling is set. """ - if self.__parallel_scheduling is not None: - return self.__parallel_scheduling - self.__parallel_scheduling = False - - job = self.getJob() - if job: - # We only need a single tester so we know what spec file to load. - # TODO: would be nice to have access to this without needing tester.specs - tester = job[0].getTester() - root = pyhit.load(os.path.join(tester.specs['test_dir'], tester.specs['spec_file'])) - self.__parallel_scheduling = root.children[0].get('parallel_scheduling', False) - - return self.__parallel_scheduling def getLock(self): """ Return the lock for this test spec (folder of jobs) """ @@ -44,7 +26,7 @@ def getLock(self): def canParallel(self): """ Return bool whether or not this group runs in parallel """ - return self._setParallel() + return self.__parallel_scheduling def createJobs(self, testers): """ Return a usable Job DAG based on supplied list of tester objects """ @@ -66,18 +48,12 @@ def getDAG(self): """ return the running DAG object """ return self.__job_dag - def getJobs(self): + def getAvailableJobs(self): """ Return a list of available jobs """ + available_jobs = [job for job in self.__job_dag.ind_nodes() if job.isHold()] if self.canParallel() and not self.options.pedantic_checks: - return self.__job_dag.ind_nodes() - return self.getJob() - - def getJob(self): - """ Return a single available job """ - concurrent_jobs = self.__job_dag.ind_nodes() - if [x for x in concurrent_jobs if x.isHold()]: - return [[x for x in concurrent_jobs if x.isHold()][0]] - return [] + return available_jobs + return available_jobs[0:1] def getJobsAndAdvance(self): """ @@ -92,7 +68,7 @@ def getJobsAndAdvance(self): next_jobs.add(job) self.__job_dag.delete_node(job) - 
next_jobs.update(self.getJobs()) + next_jobs.update(self.getAvailableJobs()) return next_jobs def removeAllDependencies(self): @@ -298,7 +274,7 @@ def _doRaceConditions(self): # Multiple tests will clobber eachothers output file # Only check this with parallel_scheduling enabled because otherwise # all of these jobs will be serialized - elif len(job_list) > 1 and self._setParallel(): + elif len(job_list) > 1 and self.canParallel(): for job in job_list: job.setOutput('Output file will over write pre-existing output file:\n\t%s\n' % (outfile)) job.setStatus(job.error, 'OUTFILE RACE CONDITION') diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 38652191888b..1e6557ad1e82 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -15,6 +15,7 @@ from timeit import default_timer as clock from multiprocessing.pool import ThreadPool import threading # for thread locking and thread timers +import pyhit class SchedulerError(Exception): pass @@ -247,9 +248,16 @@ def schedule(self, testers): # If we are not to schedule any more jobs for some reason, return now if self.__error_state: return + # Nothing to do if there aren't any testers + if not testers: + return + + # Whether or not we have parallel scheduling + root = pyhit.load(testers[0].getSpecFile()) + parallel_scheduling = root.children[0].get('parallel_scheduling', False) # Instance our job DAG, create jobs, and a private lock for this group of jobs (testers) - jobs = JobDAG(self.options) + jobs = JobDAG(self.options, parallel_scheduling) j_dag = jobs.createJobs(testers) # Allow derived schedulers access to the jobs before they launch From 2b4049f2039faa4b84ba350386a14873601ee66d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 12:23:25 -0600 Subject: [PATCH 086/243] Try a little harder to get the output, even when things fail --- python/TestHarness/runners/HPCRunner.py | 23 ++++++++++++++++------- 
1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index b228c26e86d1..42167ece2e5a 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -26,6 +26,9 @@ def __init__(self, job, options, run_hpc): # Interval in seconds for polling for file completion self.file_completion_poll_time = 0.1 + # Whether or not the primary output has been loaded fully + self.output_completed = False + def spawn(self, timer): # Rely on the RunHPC object to submit the job self.run_hpc.submitJob(self.job) @@ -59,8 +62,12 @@ def wait(self, timer): if self.exit_code is None: self.exit_code = -1 - # If we have output, we should try to add it - self.trySetOutput() + # If we have _some_ output, at least try to load it. + # Try this for 10s and then call it a loss + for i in range(40): + if self.trySetOutput(): + break + time.sleep(0.25) # Don't bother looking for the rest of the output return @@ -103,7 +110,7 @@ def wait(self, timer): # We've waited for files for too long if (wait_files or incomplete_files) and waited_time >= self.options.hpc_file_timeout: self.job.setStatus(self.job.error, 'FILE TIMEOUT') - if self.output is None: + if not self.output_completed: self.trySetOutput() def print_files(files, type): if files: @@ -139,7 +146,6 @@ def trySetOutput(self, required=False): # Whether or not we actually set it did_set = False - # Only do something if the output actually exists output_file = self.run_hpc.getHPCJobOutputPath(self.job) if os.path.exists(output_file) and os.path.isfile(output_file): try: @@ -161,9 +167,12 @@ def trySetOutput(self, required=False): except: pass - # If required and we didn't load it, mark this error - if required and not did_set: - self.job.setStatus(self.job.error, 'FAILED OUTPUT READ') + if did_set: + self.output_completed = True + else: + self.output = f'Failed to load output file {output_file}\n' + if required: + 
self.job.setStatus(self.job.error, 'FAILED OUTPUT READ') return did_set From 1748f394d3aa6c7764d46f3ff8d5fc673ba320ce Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 12:31:52 -0600 Subject: [PATCH 087/243] Reduce truncation to 500 lines --- python/TestHarness/runners/HPCRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 42167ece2e5a..6a6e4fb572db 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -277,7 +277,7 @@ def getLastLine(file): return line, pos @staticmethod - def readTruncated(file, start_lines=1000, end_lines=1000): + def readTruncated(file, start_lines=500, end_lines=500): """ Reads a file and truncates it past a certain amount of lines. """ From 51d433184eb5247ef01a669c8db0d82f70e42af2 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 12:38:54 -0600 Subject: [PATCH 088/243] Try to remove null output --- python/TestHarness/schedulers/Job.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 7a8e50abe56e..2c435c80e60c 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -252,6 +252,15 @@ def run(self): self.__end_time = self.timer.ends[-1] self.__joined_out = self.__tester.getOutput() + # Remove NULL output and fail if it exists + if self.__joined_out: + null_chars = ['\0', '\x00'] + for null_char in null_chars: + if null_char in self.__joined_out: + self.__joined_out = self.__joined_out.replace(null_char, 'NULL') + if not self.isFail(): + self.setStatus(self.error, f'NULL characters in output') + if self.options.pedantic_checks and self.canParallel(): # Check if the files we checked on earlier were modified. 
self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) From f3e87bd806443db451aa764a8ee926872ad4d3bc Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 13:17:01 -0600 Subject: [PATCH 089/243] Change placement to free --- python/TestHarness/schedulers/RunPBS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index c907aa82dcb3..3b1a00397caf 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -45,7 +45,7 @@ def _submitJob(self, job, job_data): 'PROJECT': self.options.pbs_project, 'OUTPUT': job_data.output_file, 'SUBMISSION_SCRIPT': job_data.submission_script, - 'PLACE': 'scatter', + 'PLACE': 'free', 'TEST_SPEC': tester.getSpecFile(), 'TEST_NAME': tester.getTestNameShort(), 'SUBMITTED_HOSTNAME': socket.gethostname(), From de4d594000ae267f633ad4feef6800b43861fb3d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 13:39:40 -0600 Subject: [PATCH 090/243] Make placement a CLI option and make the default free --- python/TestHarness/TestHarness.py | 1 + python/TestHarness/schedulers/RunPBS.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 8b082f2c2859..c2d39d82a029 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1063,6 +1063,7 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--hpc-host', nargs=1, action='store', dest='hpc_host', metavar='', help='The host to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') + 
hpcgroup.add_arugment('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') hpcgroup.add_argument('--pbs-project', nargs=1, action='store', dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 3b1a00397caf..aa51e6bf5ee8 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -45,7 +45,7 @@ def _submitJob(self, job, job_data): 'PROJECT': self.options.pbs_project, 'OUTPUT': job_data.output_file, 'SUBMISSION_SCRIPT': job_data.submission_script, - 'PLACE': 'free', + 'PLACE': options.hpc_place, 'TEST_SPEC': tester.getSpecFile(), 'TEST_NAME': tester.getTestNameShort(), 'SUBMITTED_HOSTNAME': socket.gethostname(), From 85775b67527683f33d84ab46044c793e58d31dac Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 13:39:50 -0600 Subject: [PATCH 091/243] Try harder to read output for PBS killed jobs --- python/TestHarness/runners/HPCRunner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 6a6e4fb572db..a93263c63dfc 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -63,11 +63,10 @@ def wait(self, timer): self.exit_code = -1 # If we have _some_ output, at least try to load it. 
- # Try this for 10s and then call it a loss - for i in range(40): + for i in range(int(self.options.hpc_file_timeout / self.file_completion_poll_time)): if self.trySetOutput(): break - time.sleep(0.25) + time.sleep(self.file_completion_poll_time) # Don't bother looking for the rest of the output return From cb5e13f5767a616930c37ca335ca1fc447c7b4af Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 13:42:23 -0600 Subject: [PATCH 092/243] Spell method correctly --- python/TestHarness/TestHarness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index c2d39d82a029..3eb82f14351b 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1063,7 +1063,7 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--hpc-host', nargs=1, action='store', dest='hpc_host', metavar='', help='The host to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') - hpcgroup.add_arugment('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') + hpcgroup.add_argument('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') hpcgroup.add_argument('--pbs-project', nargs=1, action='store', dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') 
From d37b6309d048a927965caa16f79abbd941c6bfab Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 16 May 2024 13:51:12 -0600 Subject: [PATCH 093/243] Skip install tests on HPC --- test/tests/make_install/tests | 1 + 1 file changed, 1 insertion(+) diff --git a/test/tests/make_install/tests b/test/tests/make_install/tests index 0b894bf64125..3faabad200c8 100644 --- a/test/tests/make_install/tests +++ b/test/tests/make_install/tests @@ -34,6 +34,7 @@ mkdir -p ../../../make_install_test' use_shell = True installation_type = IN_TREE + hpc = False detail = 'from a pre-determined user-readable location;' [] From f2db9ab318d3ce6541f76a544af80c5d7e4504e8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sun, 19 May 2024 11:30:44 -0600 Subject: [PATCH 094/243] Disable affinity --- python/TestHarness/schedulers/hpc_source | 1 + 1 file changed, 1 insertion(+) diff --git a/python/TestHarness/schedulers/hpc_source b/python/TestHarness/schedulers/hpc_source index 30c6242e3d10..9d7d28becf61 100644 --- a/python/TestHarness/schedulers/hpc_source +++ b/python/TestHarness/schedulers/hpc_source @@ -1 +1,2 @@ module load use.moose moose-dev-container +export MV2_ENABLE_AFFINITY=0 From fb73ffd4a83dc180fc360ea0c0d9e6fc99d10a91 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 08:33:07 -0600 Subject: [PATCH 095/243] Support multiple HPC hosts for redundancy --- python/TestHarness/TestHarness.py | 2 +- python/TestHarness/schedulers/RunHPC.py | 97 +++++++++++++++---------- python/TestHarness/schedulers/RunPBS.py | 17 ++--- python/TestHarness/testers/Tester.py | 2 +- 4 files changed, 68 insertions(+), 50 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 3eb82f14351b..1162d5ce90ff 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1060,7 +1060,7 @@ def parseCLArgs(self, argv): # Options for HPC execution hpcgroup = parser.add_argument_group('HPC Options', 'Options controlling HPC 
execution') hpcgroup.add_argument('--hpc', dest='hpc', action='store', choices=['pbs'], help='Launch tests using a HPC scheduler') - hpcgroup.add_argument('--hpc-host', nargs=1, action='store', dest='hpc_host', metavar='', help='The host to use for submitting HPC jobs') + hpcgroup.add_argument('--hpc-host', nargs='+', action='store', dest='hpc_host', metavar='', help='The host(s) to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') hpcgroup.add_argument('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index b8ba7da3d631..22ac909917d1 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -41,40 +41,44 @@ def __init__(self, harness, params): self.hpc_jobs = {} # The jump hostname for running commands, if any - self.ssh_host = self.options.hpc_host + self.ssh_hosts = self.options.hpc_host # The SSH key to use for connections - self.ssh_key_filename = None + self.ssh_key_filenames = None # The pool of processes for running threaded SSH comments self.ssh_pool = None # The threaded SSHClient objects, mapped by thread identifier + # Tuple of (paramiko.SSHClient, str) where str is the hostname self.ssh_clients = None # The lock for calling commands via SSH, self.ssh_clients_lock = None # Setup the jump host if provided - if self.ssh_host: + # We allow multitple hosts here to have backups + if self.ssh_hosts: + if isinstance(self.ssh_hosts, str): + self.ssh_hosts = [self.ssh_hosts] self.ssh_pool = ThreadPool(processes=5) 
self.ssh_clients = {} self.ssh_clients_lock = threading.Lock() - # Try to find a key to use - try: - ssh_config = os.path.expanduser('~/.ssh/config') - config = paramiko.SSHConfig.from_path(ssh_config).lookup(self.ssh_host) - identityfile = config.get('identityfile') - if identityfile is not None and len(identityfile) > 0: - self.ssh_key_filename = identityfile[-1] - except: - pass + # Try to find a key to use for each host. Paramiko doesn't + # use any non-default keys by default, so we need to search + # like this and apply them manually + self.ssh_key_filenames = {} + for host in self.ssh_hosts: + try: + ssh_config = os.path.expanduser('~/.ssh/config') + config = paramiko.SSHConfig.from_path(ssh_config).lookup(host) + identityfile = config.get('identityfile') + if identityfile is not None and len(identityfile) > 0: + self.ssh_key_filenames[host] = identityfile[-1] + except: + pass # Make sure that we can connect up front - try: - self.callHPC('hostname') - except: - print(f'Failed to connect to HPC host {self.ssh_host}') - sys.exit(1) + self.callHPC('hostname') if os.environ.get('APPTAINER_CONTAINER'): - if not self.ssh_host: + if not self.ssh_hosts: print('ERROR: --hpc-host must be set when using HPC jobs within apptainer') sys.exit(1) if not self.options.hpc_pre_source: @@ -121,10 +125,10 @@ class CallHPCException(Exception): """ Exception class for providing extra context for HPC submission errors """ - def __init__(self, run_hpc, description, command, result=None): + def __init__(self, description, host, command, result=None): message = f'{description}' - if run_hpc.ssh_host: - message += f' on host "{run_hpc.ssh_host}"' + if host: + message += f' on host "{host}"' message += f'\nCommand: {command}' if result: message += f'\n\nResult:\n{result}' @@ -139,10 +143,23 @@ def _getSSHClient(self, reconnect=False): process = threading.get_ident() with self.ssh_clients_lock: if process not in self.ssh_clients or reconnect: - self.ssh_clients[process] = 
paramiko.SSHClient() - self.ssh_clients[process].set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.ssh_clients[process].connect(self.ssh_host, key_filename=self.ssh_key_filename) - return self.ssh_clients.get(process) + self.ssh_clients[process] = None + for host in self.ssh_hosts: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + key_filename = self.ssh_key_filenames.get(host) + try: + client.connect(host, key_filename=key_filename) + except Exception as e: + print(f'WARNING: Failed to connect to HPC host {host}: {e}') + continue + self.ssh_clients[process] = (client, host) + break + + client_and_host = self.ssh_clients.get(process) + if client_and_host is None: + raise Exception('Failed to connect to SSH host(s) ', ', '.join(self.ssh_hosts)) + return client_and_host def _callSSH(self, command): """ @@ -150,32 +167,34 @@ def _callSSH(self, command): Should only be used via apply with the self.ssh_pool. """ - client = self._getSSHClient() - try: - _, stdout, stderr = client.exec_command(command) - # SSH connection might have died, so try to create a new one - except paramiko.ssh_exception.SSHException: + client, host = self._getSSHClient() + + # Here we try twice, in the event that the connection was killed + retry = False + while True: try: - client = self._getSSHClient(reconnect=True) + client, host = self._getSSHClient(reconnect=retry) _, stdout, stderr = client.exec_command(command) except Exception as e: - raise RunHPC.CallHPCException(self, 'Failed to execute remote command', command) from e - # An even worse failure happened here - except Exception as e: - raise RunHPC.CallHPCException(self, 'Failed to execute remote command', command) from e + if not retry: + retry = True + continue + raise RunHPC.CallHPCException('Failed to execute remote command', host, command) from e + break exit_code = stdout.channel.recv_exit_status() result = ''.join(stdout.readlines()) if exit_code != 0: result += 
''.join(stderr.readlines()) - return exit_code, result.rstrip() + full_command = f"ssh {host} '{command}'" + return exit_code, result.rstrip(), full_command def callHPC(self, command): """ Wrapper for calling a HPC command (qsub, qstat, etc) that supports SSH-ing to another host as needed when calling from within apptainer """ - if not self.ssh_host: + if not self.ssh_hosts: raise Exception('HPC not currently supported outside of a container') return self.ssh_pool.apply(self._callSSH, (command,)) @@ -289,7 +308,7 @@ def submitJob(self, job): job_data.additional_output_files = ' '.join(additional_output) # Let the derived class actually submit the job - job_id = self._submitJob(job, job_data) + job_id, submit_command = self._submitJob(job, job_data) # Job has been submitted, so set it as queued job.addCaveats(job_id) @@ -301,6 +320,8 @@ def submitJob(self, job): raise Exception('Job has already been submitted') self.hpc_jobs[job] = self.HPCJob(job_id, job_data.command) + return submit_command + def _submitJob(self, job, job_data): """ Submits a given job. 
diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index aa51e6bf5ee8..4b2b4c381bd9 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -86,17 +86,14 @@ def _submitJob(self, job, job_data): 'exit 1'] qsub_command = '; '.join(qsub_command) - # Set what we've ran for this job so that we can - # potentially get the context in an error - command_ran = qsub_command - if self.ssh_host: - command_ran = f"ssh {self.ssh_host} '{qsub_command}'" - job.getTester().setCommandRan(command_ran) - # Do the submission; this is thread safe # Eventually we might want to make this a pool so we can submit multiple # jobs at the same time - exit_code, result = self.callHPC(qsub_command) + exit_code, result, full_qsub_command = self.callHPC(qsub_command) + + # Set what we've ran for this job so that we can + # potentially get the context in an error + job.getTester().setCommandRan(full_qsub_command) # Nonzero return code if exit_code != 0: @@ -108,7 +105,7 @@ def _submitJob(self, job, job_data): if not search: raise self.CallHPCException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) - return job_id + return job_id, full_qsub_command def updateJobs(self): # Obtain the IDs of jobs that are active that we need to poll for @@ -119,7 +116,7 @@ def updateJobs(self): # Poll for all of the jobs within a single call cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids - exit_code, result = self.callHPC(' '.join(cmd)) + exit_code, result, _ = self.callHPC(' '.join(cmd)) if exit_code != 0: raise self.CallHPCException(self, 'Failed to get job status', cmd, result) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index e5dd547e1703..525357b1a760 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -204,7 +204,7 @@ def getOutput(self) -> str: def isOutputReady(self) -> bool: """Returns whether or not the output is 
ready for reading""" - return self._runner.isOutputReady() + return self._runner is not None and self._runner.isOutputReady() def getExitCode(self) -> int: """Gets the exit code of the command that was ran""" From 817d4fafe7f696924f4fae79a0b73278ea7e2b32 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 08:52:12 -0600 Subject: [PATCH 096/243] Use new error --- python/TestHarness/tests/test_UnknownPrereq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/tests/test_UnknownPrereq.py b/python/TestHarness/tests/test_UnknownPrereq.py index f10b7058247b..1da1d53a1fae 100644 --- a/python/TestHarness/tests/test_UnknownPrereq.py +++ b/python/TestHarness/tests/test_UnknownPrereq.py @@ -19,4 +19,4 @@ def testUnknownPrereq(self): self.runTests('-i', 'unknown_prereq') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.foo.*?FAILED \(unknown dependency\)') + self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.foo.*?FAILED \(unknown dependency non_existent\)') From 86c349135e7dce1ae091b2e64913f9193e991882 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 09:15:17 -0600 Subject: [PATCH 097/243] Use job ID in the terminator --- python/TestHarness/runners/HPCRunner.py | 13 ++++++++----- python/TestHarness/schedulers/RunHPC.py | 9 ++------- python/TestHarness/schedulers/RunPBS.py | 5 ++++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index a93263c63dfc..5ca57bcfaaee 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -20,6 +20,9 @@ def __init__(self, job, options, run_hpc): # The RunHPC object self.run_hpc = run_hpc + # The HPCJob object, updated in wait() + self.hpc_job = None + # Interval in seconds for polling for job status self.job_status_poll_time = 0.1 @@ -44,11 +47,11 @@ def wait(self, timer): # polling itself is 
only done on occasion within RunHPC while True: time.sleep(self.job_status_poll_time) - hpc_job = self.run_hpc.getHPCJob(self.job) + self.hpc_job = self.run_hpc.getHPCJob(self.job) # We're done - if hpc_job.done: - self.exit_code = hpc_job.exit_code + if self.hpc_job.done: + self.exit_code = self.hpc_job.exit_code break timer.stop() @@ -75,7 +78,7 @@ def wait(self, timer): # We've actually ran something now and not just qsub, so update the # command to what was ran there - tester.setCommandRan(hpc_job.command) + tester.setCommandRan(self.hpc_job.command) # Determine the output files that we need to wait for to be complete wait_files = set([output_file]) @@ -194,7 +197,7 @@ def fileIsReady(self, file): if is_binary is None: return False - ending_comment = self.run_hpc.getOutputEndingComment() + ending_comment = self.run_hpc.getOutputEndingComment(self.hpc_job.id) # Binary file if is_binary: diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 22ac909917d1..c5defb7df82e 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -395,13 +395,8 @@ def getHPCJobSubmissionPath(job): """Gets the aboslute path for the qsub script for a HPC job""" return RunHPC.getHPCJobOutputPathPrefix(job) + '.qsub' - @staticmethod - def getOutputEndingComment() -> str: - """ - Gets the text we append to the stderr+stdout file to desginate - that it is complete - """ - return 'TESTHARNESS RUNHPC FILE TERMINATOR' + def getOutputEndingComment(self, job_id) -> str: + raise Exception('Unimplemented getOutputEndingComment()') @staticmethod def parseMPICommand(command) -> str: diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 4b2b4c381bd9..852a71608553 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -52,7 +52,7 @@ def _submitJob(self, job, job_data): 'CWD': tester.getTestDir(), 'COMMAND': 
job_data.command, 'COMMAND_PRINTABLE': job_data.command_printable, - 'ENDING_COMMENT': self.getOutputEndingComment(), + 'ENDING_COMMENT': self.getOutputEndingComment('${PBS_JOBID}'), 'MOOSE_PYTHONPATH': moose_python, 'ADDITIONAL_OUTPUT_FILES': job_data.additional_output_files} if self.options.hpc_queue: @@ -198,3 +198,6 @@ def killRemaining(self, keyboard=False): hpc_job.killed = True RunParallel.killRemaining(self, keyboard) + + def getOutputEndingComment(self, job_id): + return f'TESTHARNESS RUNPBS FILE TERMINATOR FOR {job_id}' From 85fe0e11cd279f3065ab8ade97b302c164bf6250 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 09:40:48 -0600 Subject: [PATCH 098/243] Only print the caveats when requested --- python/TestHarness/TestHarness.py | 11 ++++++----- python/TestHarness/schedulers/RunHPC.py | 2 +- python/TestHarness/schedulers/RunPBS.py | 2 +- python/TestHarness/schedulers/Scheduler.py | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 1162d5ce90ff..fa1bdbf4d1aa 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -621,7 +621,7 @@ def printOutput(self, job, color): print(output) return output - def handleJobStatus(self, job): + def handleJobStatus(self, job, caveats=None): """ The Scheduler is calling back the TestHarness to inform us of a status change. The job may or may not be finished yet (RUNNING), or failing, passing, etc. 
@@ -637,8 +637,9 @@ def handleJobStatus(self, job): # perform printing of application output if so desired self.printOutput(job, color) - # Print status with caveats - print((util.formatResult(job, self.options, caveats=True))) + # Print status with caveats (if caveats not overridden) + caveats = True if caveats is None else caveats + print((util.formatResult(job, self.options, caveats=caveats))) timing = job.getTiming() @@ -657,8 +658,8 @@ def handleJobStatus(self, job): # Just print current status without saving results else: - # TODO: changed this caveats=True - print((util.formatResult(job, self.options, result=job.getStatus().status, caveats=True))) + caveats = False if caveats is None else caveats + print((util.formatResult(job, self.options, result=job.getStatus().status, caveats=caveats))) # Print final results, close open files, and exit with the correct error code def cleanup(self): diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index c5defb7df82e..2912d96d473e 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -312,7 +312,7 @@ def submitJob(self, job): # Job has been submitted, so set it as queued job.addCaveats(job_id) - self.setAndOutputJobStatus(job, job.queued) + self.setAndOutputJobStatus(job, job.queued, caveats=True) # Setup the job in the status map with self.hpc_jobs_lock: diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 852a71608553..1dea559be71f 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -143,7 +143,7 @@ def updateJobs(self): # Get the job state, and report running if it switched to running if state == 'R' and not pbs_job.running: pbs_job.running = True - self.setAndOutputJobStatus(job, job.running) + self.setAndOutputJobStatus(job, job.running, caveats=True) # If we were running but now we're done, we're not running anymore if 
pbs_job.running and done: diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 1e6557ad1e82..3eef8f8a2013 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -192,14 +192,14 @@ def __sortAndLaunch(self): for jobs, _ in sorted_jobs: self.queueJobs(jobs) - def setAndOutputJobStatus(self, job, status): + def setAndOutputJobStatus(self, job, status, caveats=None): """ Sets a Job's status and forces the status to be output asap """ with job.getLock(): job.setStatus(status) job.force_report_status = True - self.handleJobStatus(job) + self.handleJobStatus(job, caveats=caveats) def waitFinish(self): """ From 7510fdadd5e9b3d5c46bc0ac1fd9fc39aa547d6f Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 10:28:59 -0600 Subject: [PATCH 099/243] Use more versatile race condition checker This checks _all_ inputs, not necessarily the ones that are to be run --- python/TestHarness/JobDAG.py | 36 ----------------- python/TestHarness/schedulers/Job.py | 6 +-- python/TestHarness/testers/Tester.py | 2 +- python/TestHarness/tests/test_Duplicate.py | 40 ++++--------------- python/TestHarness/tests/test_InstallType.py | 13 +++--- python/TestHarness/tests/test_MachineType.py | 5 ++- .../test_harness/duplicate_outputs_prereqs | 27 ------------- .../test_harness/multiple_duplicate_outputs | 7 ---- 8 files changed, 21 insertions(+), 115 deletions(-) delete mode 100644 test/tests/test_harness/duplicate_outputs_prereqs delete mode 100644 test/tests/test_harness/multiple_duplicate_outputs diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 6bbd2317cca9..16ee681c0e36 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -92,10 +92,6 @@ def _checkDAG(self): # Remove edges for jobs that are skipped self._doSkippedDependencies() - # If there are race conditions, then there may be more skipped jobs - if 
self._doRaceConditions(): - self._doSkippedDependencies() - return self.__job_dag def _addEdge(self, child, parent): @@ -247,38 +243,6 @@ def _checkOutputCollisions(self): print(' between your tests with the "prereq" parameter') sys.exit(1) - def _doRaceConditions(self): - """ Check for race condition errors within in the DAG""" - # Build output_file in relation to job dictionary - output_to_job = {} - for job in self.getJobs(): - if job.getRunnable() and not job.isFinished(): - for output_file in job.getOutputFiles(self.options): - output_to_job[output_file] = output_to_job.get(output_file, []) - output_to_job[output_file].append(job) - - # Remove jobs which have accurate dependencies - for outfile, job_list in output_to_job.items(): - for job in list(job_list): - for match_job in self.__job_dag.all_downstreams(job): - if match_job in job_list: - job_list.remove(match_job) - - # Left over multiple items in job_list are problematic - for outfile, job_list in output_to_job.items(): - # Same test has duplicate output files - if len(job_list) > 1 and len(set(job_list)) == 1: - job_list[0].setOutput('Duplicate output files:\n\t%s\n' % (outfile)) - job_list[0].setStatus(job.error, 'DUPLICATE OUTFILES') - - # Multiple tests will clobber eachothers output file - # Only check this with parallel_scheduling enabled because otherwise - # all of these jobs will be serialized - elif len(job_list) > 1 and self.canParallel(): - for job in job_list: - job.setOutput('Output file will over write pre-existing output file:\n\t%s\n' % (outfile)) - job.setStatus(job.error, 'OUTFILE RACE CONDITION') - def _skipPrereqs(self): """ Method to return boolean to skip dependency prerequisites checks. 
diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 2c435c80e60c..9e2b28ab5b76 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -280,7 +280,7 @@ def getEndTime(self): def getOutput(self): """ Return the contents of output """ - return self.__joined_out + return self.__joined_out if self.__joined_out else '' def getOutputFile(self): """ Return the output file path """ @@ -317,13 +317,13 @@ def setOutput(self, output, force=False): def getActiveTime(self): """ Return active time """ - m = re.search(r"Active time=(\S+)", self.__joined_out) + m = re.search(r"Active time=(\S+)", self.getOutput()) if m != None: return float(m.group(1)) def getSolveTime(self): """ Return solve time """ - m = re.search(r"solve().*", self.__joined_out) + m = re.search(r"solve().*", self.getOutput()) if m != None: return m.group().split()[5] diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 525357b1a760..d382f19a051d 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -291,7 +291,7 @@ def getTestDir(self): return self.specs['test_dir'] def getSpecFile(self): - return os.path.join(self.getTestDir(), self.specs['spec_file']) + return os.path.join(self.specs['test_dir'], self.specs['spec_file']) def getMinReportTime(self): """ return minimum time elapse before reporting a 'long running' status """ diff --git a/python/TestHarness/tests/test_Duplicate.py b/python/TestHarness/tests/test_Duplicate.py index 3ed824e69ecf..5e73b23a627e 100644 --- a/python/TestHarness/tests/test_Duplicate.py +++ b/python/TestHarness/tests/test_Duplicate.py @@ -18,18 +18,17 @@ def testDuplicateOutputs(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'duplicate_outputs') - e = cm.exception + output = cm.exception.output.decode('utf-8') + self.assertIn('Tests: d, c', output) + self.assertIn('File(s): 
good_out.e', output) - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(OUTFILE RACE CONDITION\)') - - # Use a different spec file, which makes use of the AnalyzeJacobian tester. The is because - # a race condition, when caught, will invalidate the rest of the tests with out testing them. + # Use a different spec file, which makes use of the AnalyzeJacobian tester with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'duplicate_outputs_analyzejacobian') - e = cm.exception - - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(OUTFILE RACE CONDITION\)') + output = cm.exception.output.decode('utf-8') + self.assertIn('Tests: b, a', output) + self.assertIn('File(s): good.i', output) def testDuplicateOutputsOK(self): """ @@ -45,28 +44,3 @@ def testDuplicateOutputsOK(self): self.assertNotRegexpMatches(output.decode('utf-8'), 'heavy_out.e') # all self.assertNotRegexpMatches(output.decode('utf-8'), 'FATAL TEST HARNESS ERROR') - - def testDelayedDuplicateOutputs(self): - """ - Test a more complex, delayed, race condition by running three tests. Two which launch - immediately, and a third, waiting on one job to finish. When it does, this third test - will write to the same output file, that one of the other tests which is still running - is writing to. Thus, causing a delayed race condition. 
- """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'duplicate_outputs_prereqs') - - e = cm.exception - - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(OUTFILE RACE CONDITION\)') - - def testMultipleDuplicateOutputs(self): - """ - Test for multiple duplicate outputs created by one test - """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'multiple_duplicate_outputs') - - e = cm.exception - - self.assertRegex(e.output.decode('utf-8'), r'FAILED \(DUPLICATE OUTFILES\)') diff --git a/python/TestHarness/tests/test_InstallType.py b/python/TestHarness/tests/test_InstallType.py index 16796b0388d4..30bb7abbc78b 100644 --- a/python/TestHarness/tests/test_InstallType.py +++ b/python/TestHarness/tests/test_InstallType.py @@ -21,6 +21,7 @@ def mocked_output(self, mocked, expect_fail, mocked_return): out = io.StringIO() with redirect_stdout(out): mocked_return.return_value=mocked + os.environ['MOOSE_TERM_FORMAT'] = 'njCst' harness = TestHarness.TestHarness(['', '-i', 'install_type', '-c'], MOOSE_DIR) if expect_fail: with self.assertRaises(SystemExit): @@ -34,15 +35,15 @@ def testInstalled(self): Test which only runs if binary is installed """ out = self.mocked_output(set(['ALL', 'INSTALLED']), False) - self.assertRegex(out, r'.*?SKIP.*?in_tree_type.*?"IN_TREE" binary]') - self.assertRegex(out, r'.*?OK.*?installed_type') - self.assertRegex(out, r'.*?OK.*?all_type') + self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+\[test requires "IN_TREE" binary\]\s+SKIP') + self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+OK') + self.assertRegex(out, r'tests\/test_harness\.all_type[\s.]+OK') def testInTree(self): """ Test which only runs if binary is in_tree """ out = self.mocked_output(set(['ALL', 'IN_TREE']), False) - self.assertRegex(out, r'.*?SKIP.*?installed_type.*?"INSTALLED" binary]') - self.assertRegex(out, r'.*?OK.*?in_tree_type') - 
self.assertRegex(out, r'.*?OK.*?all_type') + self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+OK') + self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+\[test requires "INSTALLED" binary\]\s+SKIP') + self.assertRegex(out, r'tests\/test_harness\.all_type[\s.]+OK') diff --git a/python/TestHarness/tests/test_MachineType.py b/python/TestHarness/tests/test_MachineType.py index ac4026f4db35..dea14ae2506e 100644 --- a/python/TestHarness/tests/test_MachineType.py +++ b/python/TestHarness/tests/test_MachineType.py @@ -21,6 +21,7 @@ def mocked_output(self, mocked, expect_fail, mocked_return): out = io.StringIO() with redirect_stdout(out): mocked_return.return_value=mocked + os.environ['MOOSE_TERM_FORMAT'] = 'njCst' harness = TestHarness.TestHarness(['', '-i', 'always_ok', '-c'], MOOSE_DIR) if expect_fail: with self.assertRaises(SystemExit): @@ -34,11 +35,11 @@ def testNotSkipped(self): Test should not be skipped, as it is set to run on any arch (ALL) """ out = self.mocked_output(set(['ALL']), False) - self.assertRegex(out, r'.*?OK.*?always_ok') + self.assertRegex(out, r'tests\/test_harness\.always_ok[\s.]+OK') def testSkipped(self): """ Test that a non existing machine type is skipped (remove default of ALL) """ out = self.mocked_output(set(['']), False) - self.assertRegex(out, r'.*?SKIP.*?always_ok.*?MACHINE!=ALL') + self.assertRegex(out, r'tests\/test_harness\.always_ok[\s.]+\[MACHINE!=ALL\]\s+SKIP') diff --git a/test/tests/test_harness/duplicate_outputs_prereqs b/test/tests/test_harness/duplicate_outputs_prereqs deleted file mode 100644 index 962258e7a0c2..000000000000 --- a/test/tests/test_harness/duplicate_outputs_prereqs +++ /dev/null @@ -1,27 +0,0 @@ -[Tests] - # Needed because the default of false will cause the - # race condition checks to be skipped - parallel_scheduling = true - - [./a] - type = Exodiff - input = good.i - cli_args = "Outputs/file_base=good_exodiff_out Outputs/exodus=true" - exodiff = 'good_exodiff_out.e' - [../] - [./b] 
- type = Exodiff - input = good.i - exodiff = 'good_out.e' - prereq = 'a' - [../] - - # While there are no immediate race conditions when this spec file is launched, - # it is possible 'a' will finish quickly, allowing 'b' to run while 'c' might - # _still_ be running... and that is a race condition. - [./c] - type = Exodiff - input = good.i - exodiff = 'good_out.e' - [../] -[] diff --git a/test/tests/test_harness/multiple_duplicate_outputs b/test/tests/test_harness/multiple_duplicate_outputs deleted file mode 100644 index 0edc754ef2be..000000000000 --- a/test/tests/test_harness/multiple_duplicate_outputs +++ /dev/null @@ -1,7 +0,0 @@ -[Tests] - [./a] - type = Exodiff - input = good.i - exodiff = 'foo.e foo.e' - [../] -[] From a83626bf168f5be3bfae169677bfb8b2b8170eb4 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 13:12:26 -0600 Subject: [PATCH 100/243] Forcefully set the output for --show-last-run --- python/TestHarness/schedulers/RunParallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 62673b37eb8e..064f33cc5b97 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -48,7 +48,7 @@ def run(self, job): if caveats: tester.addCaveats(caveats) job.setPreviousTime(job_results['TIMING']) - job.setOutput(job_results['OUTPUT']) + job.setOutput(job_results['OUTPUT'], force=True) return output = '' From 2dddd9b3cc870c1050f79fba995bb5a8d88bdfa8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 14:30:48 -0600 Subject: [PATCH 101/243] Re-add page as removed --- .../application_development/performance_benchmarking.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 modules/doc/content/application_development/performance_benchmarking.md diff --git a/modules/doc/content/application_development/performance_benchmarking.md 
b/modules/doc/content/application_development/performance_benchmarking.md new file mode 100644 index 000000000000..83c27f13038d --- /dev/null +++ b/modules/doc/content/application_development/performance_benchmarking.md @@ -0,0 +1,4 @@ +# Performance Benchmarking + +!alert error title=Removed +This content has been removed. From e5936efadd5405f7b6663c7752a4d1fc82dbf5f4 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 18:54:25 -0600 Subject: [PATCH 102/243] Fix arg for caveats --- python/TestHarness/schedulers/Scheduler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 3eef8f8a2013..7d54cfb435be 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -358,15 +358,15 @@ def handleTimeoutJob(self, job): job.setStatus(job.timeout, 'TIMEOUT') job.killProcess() - def handleJobStatus(self, job): + def handleJobStatus(self, job, caveats=None): """ Possibly reports a job's status. Whether or not it actually gets reported... is not so intuitive. """ - self.status_pool.apply_async(self.jobStatus, (job,)) + self.status_pool.apply_async(self.jobStatus, (job,caveats,)) - def jobStatus(self, job): + def jobStatus(self, job, caveats): """ Instruct the TestHarness to print the status of job. 
This is a serial threaded operation, so as to prevent clobbering of text being printed @@ -425,7 +425,7 @@ def jobStatus(self, job): return # Inform the TestHarness of job status - self.harness.handleJobStatus(job) + self.harness.handleJobStatus(job, caveats=caveats) # Reset activity clock if not job.isSilent(): From bf05296e8c42095f445db2966d555681395bc88b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 19:20:54 -0600 Subject: [PATCH 103/243] Check for application not found earlier and catch it --- python/TestHarness/schedulers/RunParallel.py | 1 - python/TestHarness/testers/RunApp.py | 3 +-- python/TestHarness/testers/Tester.py | 9 +++++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 064f33cc5b97..8dd0815f3aab 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -30,7 +30,6 @@ def __init__(self, harness, params): def run(self, job): """ Run a tester command """ - tester = job.getTester() # Build and set the runner that will actually run the commands diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index c2544cdbe263..f0059dc25a0d 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -166,8 +166,7 @@ def getCommand(self, options): # Check for built application if shutil.which(specs['executable']) is None: - self.setStatus(self.fail, 'Application not found') - return '' + self.setStatus(self.fail, 'APPLICATION NOT FOUND') # If no_additional_cli_args is set to True, return early with a simplified command line ignoring # all other TestHarness supplied options. 
diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index d382f19a051d..7492c50a7b8d 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -449,11 +449,16 @@ def run(self, job, options, timer): if needed. The run method is responsible to call the start+stop methods on timer to record the time taken to run the actual test. start+stop can be called multiple times. """ - # Verify that the working directory is available right before we execute. + # Verify that the working directory is available right before we execute if not os.path.exists(self.getTestDir()): + self.setStatus(self.fail, 'WORKING DIRECTORY NOT FOUND') + # Getting the command can also cause a failure, so try that + self.getCommand(options) + + # If we've failed already, nothing to do here + if job.isFail(): # Timers must be used since they are directly indexed in the Job class timer.start() - self.setStatus(self.fail, 'WORKING DIRECTORY NOT FOUND') timer.stop() return From b5dd649d192ab4ecb9bc2eff00ff03ed48c5bd07 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 20:38:55 -0600 Subject: [PATCH 104/243] Move terminal cols and format to CLI args to make tests robust --- python/TestHarness/TestHarness.py | 30 +++++++++++++++---- .../TestHarness/tests/TestHarnessTestCase.py | 6 ++-- python/TestHarness/tests/test_InstallType.py | 10 +++---- python/TestHarness/tests/test_MachineType.py | 6 ++-- python/TestHarness/util.py | 23 +++++--------- 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index fa1bdbf4d1aa..b82e329c62ea 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -666,18 +666,18 @@ def cleanup(self): # Print the results table again if a bunch of output was spewed to the screen between # tests as they were running if len(self.parse_errors) > 0: - print(('\n\nParser Errors:\n' + ('-' * 
(util.TERM_COLS)))) + print(('\n\nParser Errors:\n' + ('-' * (self.options.term_cols)))) for err in self.parse_errors: print((util.colorText(err, 'RED', html=True, colored=self.options.colored, code=self.options.code))) if (self.options.verbose or (self.num_failed != 0 and not self.options.quiet)) and not self.options.dry_run: - print(('\n\nFinal Test Results:\n' + ('-' * (util.TERM_COLS)))) + print(('\n\nFinal Test Results:\n' + ('-' * (self.options.term_cols)))) for (job, sort_value, timing) in sorted(self.test_table, key=lambda x: x[1]): print((util.formatResult(job, self.options, caveats=True))) time = clock() - self.start_time - print(('-' * (util.TERM_COLS))) + print(('-' * (self.options.term_cols))) # Mask off TestHarness error codes to report parser errors fatal_error = '' @@ -731,7 +731,7 @@ def cleanup(self): sorted_tups = sorted(self.test_table, key=lambda tup: float(tup[0].getTiming()), reverse=True) print('\n%d longest running jobs:' % self.options.longest_jobs) - print(('-' * (util.TERM_COLS))) + print(('-' * (self.options.term_cols))) # Copy the current options and force timing to be true so that # we get times when we call formatResult() below @@ -764,10 +764,10 @@ def cleanup(self): sorted_table = sorted(dag_table, key=lambda dag_table: float(dag_table[1]), reverse=True) if sorted_table[0:self.options.longest_jobs]: print(f'\n{self.options.longest_jobs} longest running folders:') - print(('-' * (util.TERM_COLS))) + print(('-' * (self.options.term_cols))) # We can't use util.formatResults, as we are representing a group of testers for group in sorted_table[0:self.options.longest_jobs]: - print(str(group[0]).ljust((util.TERM_COLS - (len(group[1]) + 4)), ' '), f'[{group[1]}s]') + print(str(group[0]).ljust((self.options.term_cols - (len(group[1]) + 4)), ' '), f'[{group[1]}s]') print('\n') # Perform any write-to-disc operations @@ -1068,6 +1068,24 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--pbs-project', nargs=1, action='store', 
dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') + # Try to find the terminal size if we can + # Try/except here because the terminal size could fail w/o a display + term_cols = None + try: + term_cols = os.get_terminal_size().columns * 7/8 + except: + term_cols = 110 + pass + + # Optionally load in the environment controlled values + term_cols = int(os.getenv('MOOSE_TERM_COLS', term_cols)) + term_format = os.getenv('MOOSE_TERM_FORMAT', 'njcst') + + # Terminal options + termgroup = parser.add_argument_group('Terminal Options', 'Options for controlling the formatting of terminal output') + termgroup.add_argument('--term-cols', dest='term_cols', action='store', type=int, default=term_cols, help='The number columns to use in output') + termgroup.add_argument('--term-format', dest='term_format', action='store', type=str, default=term_format, help='The formatting to use when outputting job status') + code = True if self.code.decode() in argv: del argv[argv.index(self.code.decode())] diff --git a/python/TestHarness/tests/TestHarnessTestCase.py b/python/TestHarness/tests/TestHarnessTestCase.py index dbbb0e2c48e0..4fbfac2679b5 100644 --- a/python/TestHarness/tests/TestHarnessTestCase.py +++ b/python/TestHarness/tests/TestHarnessTestCase.py @@ -18,8 +18,7 @@ class TestHarnessTestCase(unittest.TestCase): """ def runExceptionTests(self, *args): - os.environ['MOOSE_TERM_FORMAT'] = 'njCst' - cmd = ['./run_tests'] + list(args) + cmd = ['./run_tests'] + list(args) + ['--term-format', 'njCst'] try: return subprocess.check_output(cmd, cwd=os.path.join(os.getenv('MOOSE_DIR'), 'test')) raise RuntimeError('test failed to fail') @@ -27,8 +26,7 @@ def runExceptionTests(self, *args): return err.output def runTests(self, *args): - os.environ['MOOSE_TERM_FORMAT'] = 
'njCst' - cmd = ['./run_tests'] + list(args) + cmd = ['./run_tests'] + list(args) + ['--term-format', 'njCst'] return subprocess.check_output(cmd, cwd=os.path.join(os.getenv('MOOSE_DIR'), 'test')) def checkStatus(self, output, passed=0, skipped=0, pending=0, failed=0): diff --git a/python/TestHarness/tests/test_InstallType.py b/python/TestHarness/tests/test_InstallType.py index 30bb7abbc78b..1a2ae7e047cf 100644 --- a/python/TestHarness/tests/test_InstallType.py +++ b/python/TestHarness/tests/test_InstallType.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, io +import os, io import unittest import mock import TestHarness @@ -21,8 +21,8 @@ def mocked_output(self, mocked, expect_fail, mocked_return): out = io.StringIO() with redirect_stdout(out): mocked_return.return_value=mocked - os.environ['MOOSE_TERM_FORMAT'] = 'njCst' - harness = TestHarness.TestHarness(['', '-i', 'install_type', '-c'], MOOSE_DIR) + cmd = ['', '-i', 'install_type', '-c', '--term-format', 'njCst'] + harness = TestHarness.TestHarness(cmd, MOOSE_DIR) if expect_fail: with self.assertRaises(SystemExit): harness.findAndRunTests() @@ -35,7 +35,7 @@ def testInstalled(self): Test which only runs if binary is installed """ out = self.mocked_output(set(['ALL', 'INSTALLED']), False) - self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+\[test requires "IN_TREE" binary\]\s+SKIP') + self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+\[TEST REQUIRES "IN_TREE" BINARY\]\s+SKIP') self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+OK') self.assertRegex(out, r'tests\/test_harness\.all_type[\s.]+OK') @@ -45,5 +45,5 @@ def testInTree(self): """ out = self.mocked_output(set(['ALL', 'IN_TREE']), False) self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+OK') - self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+\[test requires "INSTALLED" binary\]\s+SKIP') + 
self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+\[TEST REQUIRES "INSTALLED" BINARY\]\s+SKIP') self.assertRegex(out, r'tests\/test_harness\.all_type[\s.]+OK') diff --git a/python/TestHarness/tests/test_MachineType.py b/python/TestHarness/tests/test_MachineType.py index dea14ae2506e..10508d8a0c06 100644 --- a/python/TestHarness/tests/test_MachineType.py +++ b/python/TestHarness/tests/test_MachineType.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, io +import os, io import unittest import mock import TestHarness @@ -21,8 +21,8 @@ def mocked_output(self, mocked, expect_fail, mocked_return): out = io.StringIO() with redirect_stdout(out): mocked_return.return_value=mocked - os.environ['MOOSE_TERM_FORMAT'] = 'njCst' - harness = TestHarness.TestHarness(['', '-i', 'always_ok', '-c'], MOOSE_DIR) + cmd = ['', '-i', 'always_ok', '-c', '--term-format', 'njCst'] + harness = TestHarness.TestHarness(cmd, MOOSE_DIR) if expect_fail: with self.assertRaises(SystemExit): harness.findAndRunTests() diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index 1dd67b6433b8..523e9d41409a 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -15,15 +15,6 @@ import yaml import sys -DEFAULT_TERM_COLS = None -try: - DEFAULT_TERM_COLS = os.get_terminal_size().columns * 5/6 -except: - DEFAULT_TERM_COLS = 110 - pass -TERM_COLS = int(os.getenv('MOOSE_TERM_COLS', DEFAULT_TERM_COLS)) -TERM_FORMAT = os.getenv('MOOSE_TERM_FORMAT', 'njcst') - MOOSE_OPTIONS = { 'ad_size' : { 're_option' : r'#define\s+MOOSE_AD_MAX_DOFS_PER_ELEM\s+(\d+)', 'default' : '64' @@ -268,7 +259,7 @@ def formatStatusMessage(job, status, message, options): # 2) the color parameter is False. 
def formatResult(job, options, result='', color=True, **kwargs): # Support only one instance of a format identifier, but obey the order - terminal_format = list(OrderedDict.fromkeys(list(TERM_FORMAT))) + terminal_format = list(OrderedDict.fromkeys(list(options.term_format))) status, message, message_color, exit_code, sort_value = job.getJointStatus() color_opts = {'code' : options.code, 'colored' : options.colored} @@ -328,12 +319,12 @@ def formatResult(job, options, result='', color=True, **kwargs): character_count = resultCharacterCount(formatted_results) + len(f_caveats) + 1 # If caveats are the last items the user wants printed, or -e (extra_info) is - # called, allow caveats to consume available character count beyond TERM_COLS. + # called, allow caveats to consume available character count beyond options.term_cols. # Else, we trim caveats: if terminal_format[-1].lower() != 'c' \ and not options.extra_info \ - and character_count > TERM_COLS: - over_by_amount = character_count - TERM_COLS + and character_count > options.term_cols: + over_by_amount = character_count - options.term_cols f_caveats = '[' + caveats[:len(caveats) - (over_by_amount + 3)] + '...]' formatCase(caveat_index, (f_caveats, caveat_color), formatted_results) @@ -343,9 +334,9 @@ def formatResult(job, options, result='', color=True, **kwargs): j_dot = None # +1 space created later by join character_count = resultCharacterCount(formatted_results) + 1 - if character_count < TERM_COLS: - j_dot = ('.'*max(0, (TERM_COLS - character_count)), 'GREY') - elif character_count == TERM_COLS: + if character_count < options.term_cols: + j_dot = ('.'*max(0, (options.term_cols - character_count)), 'GREY') + elif character_count == options.term_cols: j_dot = ('', 'GREY') formatCase(justification_index, j_dot, formatted_results) From 92328c5487d3d805b654066cd87e068ad77f9585 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 22:08:13 -0600 Subject: [PATCH 105/243] Also escape ${ --- 
python/TestHarness/schedulers/RunHPC.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 2912d96d473e..2e6c8234d7cf 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -259,6 +259,7 @@ def submitJob(self, job): command = tester.getCommand(options) command = command.replace('\n', ' ') command = command.replace("'", "\'\\'\'") + command = command.replace('${', '\${') # Special logic for when we're running with apptainer, in which case # we need to manipulate the command like such From d3c4553ccdaf063472a1738a27e1320a728c8e4d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 20 May 2024 21:51:54 -0600 Subject: [PATCH 106/243] Begin work on container isolation: --no-home and specific bindpaths --- python/TestHarness/TestHarness.py | 2 ++ python/TestHarness/schedulers/RunHPC.py | 24 +++++++++++++++++++----- python/TestHarness/schedulers/hpc_source | 1 + 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index b82e329c62ea..df4b7efe6734 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1065,6 +1065,8 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') hpcgroup.add_argument('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') + hpcgroup.add_argument('--hpc-apptainer-bindpath', nargs=1, action='store', type=str, dest='hpc_apptainer_bindpath', help='Sets the apptainer bindpath for HPC jobs') + 
hpcgroup.add_argument('--hpc-apptainer-no-home', action='store_true', dest='hpc_apptainer_no_home', help='Passes --no-home to apptainer for HPC jobs') hpcgroup.add_argument('--pbs-project', nargs=1, action='store', dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 2e6c8234d7cf..d7f8b4713a34 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from RunParallel import RunParallel -import threading, os, re, sys, datetime +import threading, os, re, sys, datetime, shlex import paramiko from multiprocessing.pool import ThreadPool from timeit import default_timer as clock @@ -85,6 +85,13 @@ def __init__(self, harness, params): default_pre_source = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_source') self.options.hpc_pre_source = default_pre_source print(f'INFO: Setting --hpc-pre-source={default_pre_source}') + else: + if self.options.hpc_apptainer_bindpath: + print('ERROR: --hpc-apptainer-bindpath is unused when not executing with apptainer') + sys.exit(1) + if self.options.hpc_apptainer_no_home: + print('ERROR: --hpc-apptainer-no-home is unused when not executing with apptainer') + sys.exit(1) if self.options.hpc_pre_source and not os.path.exists(self.options.hpc_pre_source): print(f'ERROR: --hpc-pre-source path {self.options.hpc_pre_source} does not exist') @@ -158,7 +165,7 @@ def _getSSHClient(self, reconnect=False): client_and_host = self.ssh_clients.get(process) if client_and_host is None: - raise Exception('Failed to connect to SSH host(s) ', ', '.join(self.ssh_hosts)) + raise Exception(f'Failed to connect to SSH host(s) 
{", ".join(self.ssh_hosts)}') return client_and_host def _callSSH(self, command): @@ -283,10 +290,17 @@ def submitJob(self, job): job_data.command_printable = command_prefix # The root filesystem path that we're in so that we can be sure to bind - # it into the container - root_path = os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] + # it into the container, if not already set + if self.options.hpc_apptainer_bindpath: + bindpath = self.options.hpc_apptainer_bindpath + else: + bindpath = '/' + os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] # The apptainer command that will get sandwiched in the middle - apptainer_command = f'apptainer exec -B /{root_path} {APPTAINER_CONTAINER}' + apptainer_command = ['apptainer', 'exec', '-B', bindpath] + if self.options.hpc_apptainer_no_home: + apptainer_command.append('--no-home') + apptainer_command.append(APPTAINER_CONTAINER) + apptainer_command = shlex.join(apptainer_command) # Append the apptainer command along with the command to be ran job_data.command += f"{apptainer_command} '{command}'" job_data.command_printable += f"{apptainer_command} \'\\'\'{command}\'\\'\'" diff --git a/python/TestHarness/schedulers/hpc_source b/python/TestHarness/schedulers/hpc_source index 9d7d28becf61..36244fdca169 100644 --- a/python/TestHarness/schedulers/hpc_source +++ b/python/TestHarness/schedulers/hpc_source @@ -1,2 +1,3 @@ +export MOOSE_DEV_CONTAINER_MINIMAL_BINDPATH=1 module load use.moose moose-dev-container export MV2_ENABLE_AFFINITY=0 From ef34d4e1a098d86b17a532b1acd25577672ffe02 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 07:57:02 -0600 Subject: [PATCH 107/243] Try to catch this --- python/TestHarness/schedulers/Scheduler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 7d54cfb435be..47480e6925d7 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ 
b/python/TestHarness/schedulers/Scheduler.py @@ -364,7 +364,12 @@ def handleJobStatus(self, job, caveats=None): Whether or not it actually gets reported... is not so intuitive. """ - self.status_pool.apply_async(self.jobStatus, (job,caveats,)) + # This try catch will get rid of the "Pool not running" errors + # when we're forced to exit + try: + self.status_pool.apply_async(self.jobStatus, (job,caveats,)) + except ValueError: + pass def jobStatus(self, job, caveats): """ From 06b05919789cf96e130e975a344ee341d7a37000 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 10:25:41 -0600 Subject: [PATCH 108/243] Move running to the Job --- python/TestHarness/runners/HPCRunner.py | 9 +- python/TestHarness/runners/Runner.py | 33 ++-- .../TestHarness/runners/SubprocessRunner.py | 2 +- python/TestHarness/schedulers/Job.py | 152 ++++++++++++++---- python/TestHarness/schedulers/RunParallel.py | 49 +----- python/TestHarness/schedulers/Scheduler.py | 2 +- python/TestHarness/testers/AnalyzeJacobian.py | 10 +- python/TestHarness/testers/CSVDiff.py | 4 +- .../testers/CSVValidationTester.py | 5 +- python/TestHarness/testers/CheckFiles.py | 4 +- python/TestHarness/testers/Exodiff.py | 4 +- python/TestHarness/testers/ImageDiff.py | 4 +- .../testers/PetscJacobianTester.py | 12 +- python/TestHarness/testers/RunApp.py | 38 +++-- python/TestHarness/testers/RunCommand.py | 10 +- python/TestHarness/testers/RunException.py | 10 +- python/TestHarness/testers/SchemaDiff.py | 6 +- python/TestHarness/testers/SignalTester.py | 4 +- python/TestHarness/testers/Tester.py | 125 ++++---------- python/TestHarness/testers/XMLDiff.py | 3 - 20 files changed, 239 insertions(+), 247 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 5ca57bcfaaee..1082479d8699 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -83,7 +83,7 @@ def wait(self, timer): # Determine the output files that 
we need to wait for to be complete wait_files = set([output_file]) # Output files needed by the Tester, only if it says we should - if tester.mustOutputExist(): + if tester.mustOutputExist(self.exit_code): for file in tester.getOutputFiles(self.options): wait_files.add(os.path.join(tester.getTestDir(), file)) # The files that we can read, but are incomplete (no terminator) @@ -138,13 +138,6 @@ def trySetOutput(self, required=False): Returns whether or not the output was set. """ - # self.output is originally set to None so that other objects - # cannot attempt to write to it before we have at least obtained - # some object, hence why we need to set it here because we're - # signaling that we're ready for output - if self.output is None: - self.output = '' - # Whether or not we actually set it did_set = False diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index a93c8b4db27e..5621fe6e84aa 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -7,6 +7,8 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html +import json + class Runner: """ Base class for running a process via a command. @@ -23,7 +25,7 @@ def __init__(self, job, options): # The job's exit code, should be set after wait() self.exit_code = None # The output the job produced; to be filled in wait() - self.output = None + self.output = '' def spawn(self, timer): """ @@ -54,10 +56,27 @@ def kill(self): def getOutput(self): """ Gets the combined output of the process. - - Should be overridden. 
""" - return self.output + output = self.output + + # Check for invalid unicode in output + try: + json.dumps(output) + except UnicodeDecodeError: + # Convert invalid output to something json can handle + output = output.decode('utf-8','replace').encode('ascii', 'replace') + # Alert the user that output has invalid characters + self.job.addCaveats('invalid characters in stdout') + + # Remove NULL output and fail if it exists + null_chars = ['\0', '\x00'] + for null_char in null_chars: + if null_char in output: + output = output.replace(null_char, 'NULL') + if not self.job.isFail(): + self.job.setStatus(self.job.error, 'NULL characters in output') + + return output def getExitCode(self): """ @@ -65,12 +84,6 @@ def getExitCode(self): """ return self.exit_code - def isOutputReady(self): - """ - Whether or not the output is ready for reading. - """ - return self.output is not None - def sendSignal(self, signal): """ Sends a signal to the process. diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index a7c2ae76f087..3d8b4c790c1a 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -61,7 +61,7 @@ def spawn(self, timer): process_env = os.environ.copy() # Don't clobber state - process_env['OMPI_MCA_orte_tmpdir_base'] = tester.getTempDirectory().name + process_env['OMPI_MCA_orte_tmpdir_base'] = self.job.getTempDirectory().name # Allow oversubscription for hosts that don't have a hostfile process_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 9e2b28ab5b76..b1fb8b33a6ca 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -11,6 +11,9 @@ from timeit import default_timer as clock from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker +from 
TestHarness.runners.Runner import Runner +from tempfile import TemporaryDirectory +import traceback class Timer(object): """ @@ -103,14 +106,55 @@ def __init__(self, tester, job_dag, options): # the next time report statuses self.force_report_status = False + # The object that'll actually do the run + self._runner = None + + # Any additional output produced by the Job (not from the Tester or Runner) + self.output = '' + + self.cached_output = None + + # A temp directory for this Job, if requested + self.tmp_dir = None + + def __del__(self): + # Do any cleaning that we can (removes the temp dir for now if it exists) + self.cleanup() + def getID(self): """Returns the unique ID for the job""" return self.id + def setRunner(self, runner: Runner): + """Sets the underlying Runner object that will run the command""" + self._runner = runner + def getLock(self): """ Get the lock associated with this job """ return self.__j_lock + def getTempDirectory(self): + """ + Gets a shared temp directory that will be cleaned up for this Tester + """ + if self.tmp_dir is None: + self.tmp_dir = TemporaryDirectory(prefix='tester_') + return self.tmp_dir + + def cleanup(self): + """ + Entry point for doing any cleaning if necessary. + + Currently just cleans up the temp directory + """ + if self.tmp_dir is not None: + # Don't let this fail + try: + self.tmp_dir.cleanup() + except: + pass + self.tmp_dir = None + def getUpstreams(self): """ Return a list of all the jobs that needed to be completed before this job """ dag = self.getDAG() @@ -232,9 +276,10 @@ def run(self): A blocking method to handle the exit status of the process object while keeping track of the time the process was active. When the process exits, read the output and close the file. 
""" + tester = self.__tester # Do not execute app, but allow processResults to commence - if not self.__tester.shouldExecute(): + if not tester.shouldExecute(): return if self.options.pedantic_checks and self.canParallel(): @@ -243,32 +288,72 @@ def run(self): self.addCaveats('pedantic check') time.sleep(1) - self.__tester.prepare(self.options) + tester.prepare(self.options) + + # Verify that the working directory is available right before we execute + if not os.path.exists(tester.getTestDir()): + self.setStatus(self.error, 'WORKING DIRECTORY NOT FOUND') + return + # Getting the command can also cause a failure, so try that + tester.getCommand(self.options) + if tester.isError(): + return - self.__start_time = clock() self.timer.reset() - self.__tester.run(self, self.options, self.timer) + + self.__start_time = clock() + + # Helper for trying and catching + def try_catch(do, exception_name): + try: + do() + except: + self.cleanup() + self.setStatus(self.error, f'{exception_name} EXCEPTION') + self.output += '\n\nPython exception encountered:\n' + traceback.format_exc() + return False + return True + + # Spawn the process + spawn = lambda: self._runner.spawn(self.timer) + if not try_catch(spawn, 'RUNNER SPAWN'): + return + + # Entry point for testers to do other things + post_spawn = lambda: tester.postSpawn(self._runner) + if not try_catch(post_spawn, 'TESTER POST SPAWN'): + return + + # And wait for it to complete + wait = lambda: self._runner.wait(self.timer) + if not try_catch(wait, 'RUNNER WAIT'): + return + self.__start_time = self.timer.starts[0] self.__end_time = self.timer.ends[-1] - self.__joined_out = self.__tester.getOutput() - - # Remove NULL output and fail if it exists - if self.__joined_out: - null_chars = ['\0', '\x00'] - for null_char in null_chars: - if null_char in self.__joined_out: - self.__joined_out = self.__joined_out.replace(null_char, 'NULL') - if not self.isFail(): - self.setStatus(self.error, f'NULL characters in output') if 
self.options.pedantic_checks and self.canParallel(): # Check if the files we checked on earlier were modified. self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) self.modifiedFiles = self.fileChecker.check_changes(self.fileChecker.getOriginalTimes(), self.fileChecker.getNewTimes()) + # Allow derived proccessResults to process the output and set a failing status (if it failed) + runner_output = self._runner.getOutput() + exit_code = self._runner.getExitCode() + run_tester = lambda: tester.run(self.options, exit_code, runner_output) + try_catch(run_tester, 'TESTER PROCESS') + + # Run cleanup now that we're done + self.cleanup() + def killProcess(self): """ Kill remaining process that may be running """ - self.__tester.killCommand() + if self._runner: + try: + self._runner.kill() + except: + pass + self.cleanup() def getStartTime(self): """ Return the time the process started """ @@ -279,8 +364,22 @@ def getEndTime(self): return self.__end_time def getOutput(self): - """ Return the contents of output """ - return self.__joined_out if self.__joined_out else '' + """ Return the combined contents of output """ + if self.cached_output: + return self.cached_output + + output = '' + if self._runner and self._runner.getOutput(): + output += self._runner.getOutput() + if self.__tester and self.__tester.getOutput(): + output += self.__tester.getOutput() + if self.output: + output += self.output + return output + + def getRunner(self): + """ Gets the Runner that actually runs the command """ + return self._runner def getOutputFile(self): """ Return the output file path """ @@ -297,23 +396,8 @@ def getOutputFile(self): 'txt'])) return os.path.join(output_dir, output_file) - def setOutput(self, output, force=False): - """ Method to allow schedulers to overwrite the output if certain conditions are met """ - if not self.__tester.isOutputReady() and not force: - return - - # Check for invalid unicode in output - try: - json.dumps(output) - - except 
UnicodeDecodeError: - # convert invalid output to something json can handle - output = output.decode('utf-8','replace').encode('ascii', 'replace') - - # Alert the user that output has invalid characters - self.addCaveats('invalid characters in stdout') - - self.__joined_out = output + def appendOutput(self, output): + self.output += output def getActiveTime(self): """ Return active time """ diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 8dd0815f3aab..82867bac45b5 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -30,11 +30,11 @@ def __init__(self, harness, params): def run(self, job): """ Run a tester command """ - tester = job.getTester() - # Build and set the runner that will actually run the commands # This is abstracted away so we can support local runners and PBS/slurm runners - tester.setRunner(self.buildRunner(job, self.options)) + job.setRunner(self.buildRunner(job, self.options)) + + tester = job.getTester() # Do not execute app, and do not processResults if self.options.dry_run: @@ -47,7 +47,7 @@ def run(self, job): if caveats: tester.addCaveats(caveats) job.setPreviousTime(job_results['TIMING']) - job.setOutput(job_results['OUTPUT'], force=True) + job.cached_output = job_results['OUTPUT'] return output = '' @@ -58,49 +58,16 @@ def run(self, job): # Launch and wait for the command to finish job.run() - # Was this job already considered finished? (Timeout, Crash, etc) - if job.isFinished(): - tester.cleanup() - return - - # Allow derived proccessResults to process the output and set a failing status (if it failed) - job_output = job.getOutput() - output = tester.processResults(tester.getMooseDir(), self.options, job_output) - - # If the tester requested to be skipped at the last minute, report that. 
- if tester.isSkip(): - output += '\n' + "#"*80 + '\nTester skipped, reason: ' + tester.getStatusMessage() + '\n' - elif tester.isFail(): - output += '\n' + "#"*80 + '\nTester failed, reason: ' + tester.getStatusMessage() + '\n' - # If the tester has not yet failed, append additional information to output - else: - # Read the output either from the temporary file or redirected files - if tester.hasRedirectedOutput(self.options): - redirected_output = util.getOutputFromFiles(tester, self.options) - output += redirected_output - - # If we asked for redirected output but none was found, we'll call that a failure - if redirected_output == '': - tester.setStatus(tester.fail, 'FILE TIMEOUT') - output += '\n' + "#"*80 + '\nTester failed, reason: ' + tester.getStatusMessage() + '\n' - + # Set the successful message + if not tester.isSkip() and not tester.isFail(): self.setSuccessfulMessage(tester) except Exception: - output += 'Python exception encountered:\n\n' + traceback.format_exc() - tester.setStatus(StatusSystem().error, 'TESTER EXCEPTION') - # Forcefully set the output here because it might have not initialized - # because the job might not have even run - job.setOutput(output, force=True) - - # Clean up now that we're done - tester.cleanup() + self.output += 'Python exception encountered:\n\n' + traceback.format_exc() + tester.setStatus(StatusSystem().error, 'JOB EXCEPTION') if job.getOutputFile(): job.addMetaData(DIRTY_FILES=[job.getOutputFile()]) - # Set testers output with modifications made above so it prints the way we want it - job.setOutput(output) - def buildRunner(self, job, options) -> Runner: """Builds the runner for a given tester diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 47480e6925d7..a55931422516 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -498,7 +498,7 @@ def runJob(self, job, jobs): except Exception: with job.getLock(): 
job.setStatus(StatusSystem().error, 'JOB EXCEPTION') - job.setOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) + job.appendOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) if timeout_timer: timeout_timer.cancel() diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 0cfe84b5a66f..9cab8a93495d 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -71,17 +71,19 @@ def getCommand(self, options): return command - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) + reason = '' specs = self.specs if specs.isValid('expect_out'): - out_ok = util.checkOutputForPattern(output, specs['expect_out']) - if (out_ok and self.getExitCode() != 0): + out_ok = util.checkOutputForPattern(runner_output, specs['expect_out']) + if (out_ok and exit_code != 0): reason = 'OUT FOUND BUT CRASH' elif (not out_ok): reason = 'NO EXPECTED OUT' if reason == '': - if self.getExitCode() != 0 : + if exit_code != 0: reason = 'CRASH' if reason != '': diff --git a/python/TestHarness/testers/CSVDiff.py b/python/TestHarness/testers/CSVDiff.py index 7d1dfbce2db5..2e34f587a43f 100644 --- a/python/TestHarness/testers/CSVDiff.py +++ b/python/TestHarness/testers/CSVDiff.py @@ -89,8 +89,8 @@ def processResultsCommand(self, moose_dir, options): return commands - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail() or self.specs['skip_checks']: return output diff --git a/python/TestHarness/testers/CSVValidationTester.py 
b/python/TestHarness/testers/CSVValidationTester.py index a4dcc2ebcc13..b3f8e972865c 100644 --- a/python/TestHarness/testers/CSVValidationTester.py +++ b/python/TestHarness/testers/CSVValidationTester.py @@ -108,8 +108,8 @@ def __init__(self, name, params): # formatting self.file_name_len = 40 - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail() or self.specs['skip_checks']: return output @@ -118,7 +118,6 @@ def processResults(self, moose_dir, options, output): if options.scaling and self.specs['scale_refine']: return output - output = "" # Make sure that all of the CSVDiff files are actually available for file in self.specs['csvdiff']: if not os.path.exists(os.path.join(self.getTestDir(), self.specs['gold_dir'], file)): diff --git a/python/TestHarness/testers/CheckFiles.py b/python/TestHarness/testers/CheckFiles.py index df9f1da06c69..20a11468d642 100644 --- a/python/TestHarness/testers/CheckFiles.py +++ b/python/TestHarness/testers/CheckFiles.py @@ -31,8 +31,8 @@ def __init__(self, name, params): def getOutputFiles(self, options): return self.specs['check_files'] + self.specs['check_not_exists'] - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) specs = self.specs diff --git a/python/TestHarness/testers/Exodiff.py b/python/TestHarness/testers/Exodiff.py index 80b9a8c20413..9cf75dd0e214 100644 --- a/python/TestHarness/testers/Exodiff.py +++ b/python/TestHarness/testers/Exodiff.py @@ -72,8 +72,8 @@ def processResultsCommand(self, moose_dir, options): return commands - def processResults(self, moose_dir, options, 
output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail() or self.specs['skip_checks']: return output diff --git a/python/TestHarness/testers/ImageDiff.py b/python/TestHarness/testers/ImageDiff.py index 2e3ce0a2d958..4adff301de0c 100644 --- a/python/TestHarness/testers/ImageDiff.py +++ b/python/TestHarness/testers/ImageDiff.py @@ -36,13 +36,13 @@ def __init__(self, name, params): def getOutputFiles(self, options): return self.specs['imagediff'] - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): """ Perform image diff """ # Call base class processResults - FileTester.processResults(self, moose_dir, options, output) + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail(): return output diff --git a/python/TestHarness/testers/PetscJacobianTester.py b/python/TestHarness/testers/PetscJacobianTester.py index 684d8992e238..5a5326c23ef0 100644 --- a/python/TestHarness/testers/PetscJacobianTester.py +++ b/python/TestHarness/testers/PetscJacobianTester.py @@ -95,17 +95,19 @@ def __compare(self, value, threshold): else: return False - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) + if self.old_petsc: if self.specs['state'].lower() == 'user': m = re.search("Norm of matrix ratio (\S+?),? difference (\S+) \(user-defined state\)", - output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL); elif self.specs['state'].lower() == 'const_positive': m = re.search("Norm of matrix ratio (\S+?),? 
difference (\S+) \(constant state 1\.0\)", - output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL); elif self.specs['state'].lower() == 'const_negative': m = re.search("Norm of matrix ratio (\S+?),? difference (\S+) \(constant state -1\.0\)", - output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL); else: self.setStatus("state must be either 'user', const_positive', or 'const_negative'", self.bucket_fail) @@ -122,7 +124,7 @@ def processResults(self, moose_dir, options, output): else: matches = re.finditer("\|\|J - Jfd\|\|_F/\|\|J\|\|_F\s?=?\s?(\S+), \|\|J - Jfd\|\|_F\s?=?\s?(\S+)", - output, re.MULTILINE | re.DOTALL) + runner_output, re.MULTILINE | re.DOTALL) reason = 'EXPECTED OUTPUT NOT FOUND' for match in matches: diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index f0059dc25a0d..0148a77f4464 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -166,7 +166,7 @@ def getCommand(self, options): # Check for built application if shutil.which(specs['executable']) is None: - self.setStatus(self.fail, 'APPLICATION NOT FOUND') + self.setStatus(self.error, 'APPLICATION NOT FOUND') # If no_additional_cli_args is set to True, return early with a simplified command line ignoring # all other TestHarness supplied options. 
@@ -239,7 +239,7 @@ def getCommand(self, options): return command - def testFileOutput(self, moose_dir, options, output): + def testFileOutput(self, moose_dir, options, runner_output): """ Set a failure status for expressions found in output """ reason = '' errors = '' @@ -255,16 +255,13 @@ def testFileOutput(self, moose_dir, options, output): custom_module = importlib.util.module_from_spec(custom_mod_spec) sys.modules['custom_module'] = custom_module custom_mod_spec.loader.exec_module(custom_module) - if custom_module.custom_evaluation(output): + if custom_module.custom_evaluation(runner_output): return errors else: errors += "#"*80 + "\n\n" + "Custom evaluation failed.\n" self.setStatus(self.fail, "CUSTOM EVAL FAILED") return errors - - - params_and_msgs = {'expect_err': {'error_missing': True, 'modes': ['ALL'], @@ -291,10 +288,10 @@ def testFileOutput(self, moose_dir, options, output): if specs.isValid(param) and (options.method in attr['modes'] or attr['modes'] == ['ALL']): match_type = "" if specs['match_literal']: - have_expected_out = util.checkOutputForLiteral(output, specs[param]) + have_expected_out = util.checkOutputForLiteral(runner_output, specs[param]) match_type = 'literal' else: - have_expected_out = util.checkOutputForPattern(output, specs[param]) + have_expected_out = util.checkOutputForPattern(runner_output, specs[param]) match_type = 'pattern' # Exclusive OR test @@ -308,7 +305,7 @@ def testFileOutput(self, moose_dir, options, output): return errors - def testExitCodes(self, moose_dir, options, output): + def testExitCodes(self, moose_dir, options, exit_code, runner_output): # Don't do anything if we already have a status set reason = '' if self.isNoStatus(): @@ -316,25 +313,25 @@ def testExitCodes(self, moose_dir, options, output): # We won't pay attention to the ERROR strings if EXPECT_ERR is set (from the derived class) # since a message to standard error might actually be a real error. This case should be handled # in the derived class. 
- if options.valgrind_mode == '' and not specs.isValid('expect_err') and len( [x for x in filter( lambda x: x in output, specs['errors'] )] ) > 0: + if options.valgrind_mode == '' and not specs.isValid('expect_err') and len( [x for x in filter( lambda x: x in runner_output, specs['errors'] )] ) > 0: reason = 'ERRMSG' - elif self.getExitCode() == 0 and specs['should_crash'] == True: + elif exit_code == 0 and specs['should_crash'] == True: reason = 'NO CRASH' - elif self.getExitCode() != 0 and specs['should_crash'] == False and self.shouldExecute(): + elif exit_code != 0 and specs['should_crash'] == False and self.shouldExecute(): # Let's look at the error code to see if we can perhaps further split this out later with a post exam reason = 'CRASH' # Valgrind runs - elif self.getExitCode() == 0 and self.shouldExecute() and options.valgrind_mode != '' and 'ERROR SUMMARY: 0 errors' not in output: + elif exit_code == 0 and self.shouldExecute() and options.valgrind_mode != '' and 'ERROR SUMMARY: 0 errors' not in runner_output: reason = 'MEMORY ERROR' if reason != '': self.setStatus(self.fail, str(reason)) - return "\n\nExit Code: " + str(self.getExitCode()) + return "\n\nExit Code: " + str(exit_code) # Return anything extra here that we want to tack onto the Output for when it gets printed later return '' - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): """ Wrapper method for testFileOutput. @@ -348,15 +345,16 @@ def processResults(self, moose_dir, options, output): # TODO: because RunParallel is now setting every successful status message, refactor testFileOutput and processResults. 
""" - output += self.testFileOutput(moose_dir, options, output) - output += self.testExitCodes(moose_dir, options, output) + output = '' + output += self.testFileOutput(moose_dir, options, runner_output) + output += self.testExitCodes(moose_dir, options, exit_code, output) return output - def mustOutputExist(self): + def mustOutputExist(self, exit_code): if self.specs['should_crash']: - return self.getExitCode() != 0 - return self.getExitCode() == 0 + return exit_code != 0 + return exit_code == 0 def needFullOutput(self, options): # We need the full output when we're trying to read from said output diff --git a/python/TestHarness/testers/RunCommand.py b/python/TestHarness/testers/RunCommand.py index ff5e888c18b2..31fb1df45fe7 100644 --- a/python/TestHarness/testers/RunCommand.py +++ b/python/TestHarness/testers/RunCommand.py @@ -26,10 +26,10 @@ def getCommand(self, options): # Create the command line string to run return self.command - def processResults(self, moose_dir, options, output): - if self.getExitCode() == 77 : + def processResults(self, moose_dir, options, exit_code, runner_output): + if exit_code == 77 : self.setStatus(self.skip) - elif self.getExitCode() != 0 : - self.setStatus(self.fail, 'CODE %d' % self.getExitCode()) + elif exit_code != 0: + self.setStatus(self.fail, 'CODE %d' % exit_code) - return output + return '' diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index 2855489cd422..da627ab494d2 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -63,14 +63,14 @@ def getProcs(self, options): return 1 return procs - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): + output = '' + # Exceptions are written to stderr, which can be interleaved so we normally redirect these # separate files. 
Here we must gather those file outputs before processing if self.hasRedirectedOutput(options): - redirected_output = util.getOutputFromFiles(self, options) - output += redirected_output + runner_output = util.getOutputFromFiles(self, options) - output += self.testFileOutput(moose_dir, options, output) - output += self.testExitCodes(moose_dir, options, output) + output += super().processResults(moose_dir, options, exit_code, runner_output) return output diff --git a/python/TestHarness/testers/SchemaDiff.py b/python/TestHarness/testers/SchemaDiff.py index 8bd5524b2e8b..0b5fc7c9be83 100644 --- a/python/TestHarness/testers/SchemaDiff.py +++ b/python/TestHarness/testers/SchemaDiff.py @@ -43,9 +43,9 @@ def prepare(self, options): if self.specs['delete_output_before_running'] == True: util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles(options)) - def processResults(self, moose_dir, options, output): - output += self.testFileOutput(moose_dir, options, output) - output += self.testExitCodes(moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) + specs = self.specs if self.isFail() or specs['skip_checks']: diff --git a/python/TestHarness/testers/SignalTester.py b/python/TestHarness/testers/SignalTester.py index 4338b22ebd15..83825b2fca14 100644 --- a/python/TestHarness/testers/SignalTester.py +++ b/python/TestHarness/testers/SignalTester.py @@ -48,5 +48,5 @@ def checkRunnable(self, options): return super().checkRunnable(options) - def postSpawn(self): - self._runner.sendSignal(self.signal) + def postSpawn(self, runner): + runner.sendSignal(self.signal) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 7492c50a7b8d..ae27bed41ea9 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -11,9 +11,7 @@ import mooseutils from TestHarness import util from 
TestHarness.StatusSystem import StatusSystem -from TestHarness.runners.Runner import Runner from FactorySystem.MooseObject import MooseObject -from tempfile import TemporaryDirectory from pathlib import Path class Tester(MooseObject): @@ -119,10 +117,6 @@ def validParams(): return params - def __del__(self): - # Do any cleaning that we can (removes the temp dir for now if it exists) - self.cleanup() - # This is what will be checked for when we look for valid testers IS_TESTER = True @@ -158,57 +152,21 @@ def __init__(self, name, params): self.fail = self.test_status.fail self.diff = self.test_status.diff self.deleted = self.test_status.deleted + self.error = self.test_status.error self.__failed_statuses = self.test_status.getFailingStatuses() self.__skipped_statuses = [self.skip, self.silent] - # A temp directory for this Tester, if requested - self.tmp_dir = None - - # The object that'll actually do the run - self._runner = None - # The command that we actually ended up running; this may change # depending on the runner which might inject something self.command_ran = None - def getTempDirectory(self): - """ - Gets a shared temp directory that will be cleaned up for this Tester - """ - if self.tmp_dir is None: - self.tmp_dir = TemporaryDirectory(prefix='tester_') - return self.tmp_dir - - def cleanup(self): - """ - Entry point for doing any cleaning if necessary. 
- - Currently just cleans up the temp directory - """ - if self.tmp_dir is not None: - # Don't let this fail - try: - self.tmp_dir.cleanup() - except: - pass - self.tmp_dir = None - - def setRunner(self, runner: Runner): - """Sets the underlying Runner object that will run the command""" - self._runner = runner + # The tester output + self.output = '' def getOutput(self) -> str: - """Return the combined contents of stdout and stderr of the command ran""" - return self._runner.getOutput() - - def isOutputReady(self) -> bool: - """Returns whether or not the output is ready for reading""" - return self._runner is not None and self._runner.isOutputReady() - - def getExitCode(self) -> int: - """Gets the exit code of the command that was ran""" - return self._runner.getExitCode() + """Return the Tester output""" + return self.output def getStatus(self): return self.test_status.getStatus() @@ -254,6 +212,8 @@ def isDiff(self): return self.getStatus() == self.diff def isDeleted(self): return self.getStatus() == self.deleted + def isError(self): + return self.getStatus() == self.error def getTestName(self): """ return test name """ @@ -431,50 +391,7 @@ def getCommandRan(self): """ return self.command_ran - def killCommand(self): - """ - Kills any currently executing process started by the runCommand method. - """ - return self._runner.kill() - - # Try to clean up anything else that we can - self.cleanup() - - def run(self, job, options, timer): - """ - This is a method that is the tester's main execution code. Subclasses can override this - method with custom code relevant to their specific testing needs. By default this method - calls runCommand. runCommand is provided as a helper for running (external) subprocesses - as part of the tester's execution and should be the *only* way subprocesses are executed - if needed. The run method is responsible to call the start+stop methods on timer to record - the time taken to run the actual test. 
start+stop can be called multiple times. - """ - # Verify that the working directory is available right before we execute - if not os.path.exists(self.getTestDir()): - self.setStatus(self.fail, 'WORKING DIRECTORY NOT FOUND') - # Getting the command can also cause a failure, so try that - self.getCommand(options) - - # If we've failed already, nothing to do here - if job.isFail(): - # Timers must be used since they are directly indexed in the Job class - timer.start() - timer.stop() - return - - # Spawn the process - try: - self._runner.spawn(timer) - except Exception as e: - raise Exception('Failed to spawn process') from e - - # Entry point for testers to do other things - self.postSpawn() - - # And wait for it to complete - self._runner.wait(timer) - - def postSpawn(self): + def postSpawn(self, runner): """ Entry point for after the process has been spawned """ @@ -484,7 +401,7 @@ def processResultsCommand(self, moose_dir, options): """ method to return the commands (list) used for processing results """ return [] - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): """ method to process the results of a finished tester """ return @@ -514,7 +431,7 @@ def clearCaveats(self): self.__caveats = set([]) return self.getCaveats() - def mustOutputExist(self): + def mustOutputExist(self, exit_code): """ Whether or not we should check for the output once it has ran We need this because the PBS/slurm Runner objects, which use @@ -522,7 +439,7 @@ def mustOutputExist(self): on the machine that submitted the jobs. A good example is RunException, where we should only look for output when we get a nonzero return code.""" - return self.getExitCode() == 0 + return exit_code == 0 # need something that will tell us if we should try to read the result @@ -839,3 +756,23 @@ def needFullOutput(self, options): when we're trying to read something from the output. 
""" return False + + def run(self, options, exit_code, runner_output): + self.output = self.processResults(self.getMooseDir(), options, exit_code, runner_output) + + # If the tester requested to be skipped at the last minute, report that. + if self.isSkip(): + self.output += '\n' + "#"*80 + '\nTester skipped, reason: ' + self.getStatusMessage() + '\n' + elif self.isFail(): + self.output += '\n' + "#"*80 + '\nTester failed, reason: ' + self.getStatusMessage() + '\n' + # If the tester has not yet failed, append additional information to output + else: + # Read the output either from the temporary file or redirected files + if self.hasRedirectedOutput(options): + redirected_output = util.getOutputFromFiles(self, options) + self.output += redirected_output + + # If we asked for redirected output but none was found, we'll call that a failure + if redirected_output == '': + self.setStatus(self.fail, 'FILE TIMEOUT') + self.output += '\n' + "#"*80 + '\nTester failed, reason: ' + self.getStatusMessage() + '\n' diff --git a/python/TestHarness/testers/XMLDiff.py b/python/TestHarness/testers/XMLDiff.py index f6145293c715..186a0d216e8d 100644 --- a/python/TestHarness/testers/XMLDiff.py +++ b/python/TestHarness/testers/XMLDiff.py @@ -29,9 +29,6 @@ def prepare(self, options): if self.specs['delete_output_before_running'] == True: util.deleteFilesAndFolders(self.getTestDir(), self.specs['xmldiff']) - def processResults(self, moose_dir, options, output): - return SchemaDiff.processResults(self, moose_dir, options, output) - def load_file(self, path1): import xmltodict with open(path1,"r") as f: From 62317c9918e091cb96cd75055b82ef463329130a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 10:26:45 -0600 Subject: [PATCH 109/243] Keep track of error state to manage max failures --- python/TestHarness/schedulers/RunPBS.py | 2 +- python/TestHarness/schedulers/Scheduler.py | 34 ++++++++++++++-------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git 
a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 1dea559be71f..f99874b818c4 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -197,7 +197,7 @@ def killRemaining(self, keyboard=False): if not hpc_job.done: hpc_job.killed = True - RunParallel.killRemaining(self, keyboard) + super().killRemaining(keyboard) def getOutputEndingComment(self, job_id): return f'TESTHARNESS RUNPBS FILE TERMINATOR FOR {job_id}' diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index a55931422516..1a170c942d1d 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -99,6 +99,9 @@ def __init__(self, harness, params): # Allow threads to set a global exception self.__error_state = False + # Lock for __error_state + self.__error_state_lock = threading.Lock() + # Private set of jobs currently running self.__active_jobs = set([]) @@ -134,7 +137,8 @@ def availableSlots(self, params): return available_slots, soft_limit def triggerErrorState(self): - self.__error_state = True + with self.__error_state_lock: + self.__error_state = True self.run_pool.close() self.status_pool.close() @@ -143,11 +147,9 @@ def killRemaining(self, keyboard=False): with self.activity_lock: for job in self.__active_jobs: job.killProcess() + self.triggerErrorState() if keyboard: - self.triggerErrorState() self.harness.keyboard_interrupt() - else: - self.triggerErrorState() def retrieveJobs(self): """ return all the jobs the scheduler was tasked to perform work for """ @@ -159,7 +161,9 @@ def retrieveDAGs(self): def schedulerError(self): """ boolean if the scheduler prematurely exited """ - return self.__error_state and not self.maxFailures() + with self.__error_state_lock: + error_state = self.__error_state + return error_state and not self.maxFailures() def maxFailures(self): """ Boolean for hitting max failures """ @@ -213,13 
+217,17 @@ def waitFinish(self): with self.__bank_lock: if not self.__job_bank: break + with self.__error_state_lock: + error_state = self.__error_state + if error_state: + break sleep(0.1) # Completed all jobs sanity check - if not self.__error_state and self.__job_bank: + if not error_state and self.__job_bank: raise SchedulerError('Scheduler exiting with different amount of work than what was initially tasked!') - if not self.__error_state: + if not error_state: self.run_pool.close() self.run_pool.join() self.status_pool.close() @@ -246,8 +254,9 @@ def schedule(self, testers): This process is serial. """ # If we are not to schedule any more jobs for some reason, return now - if self.__error_state: - return + with self.__error_state_lock: + if self.__error_state: + return # Nothing to do if there aren't any testers if not testers: return @@ -461,8 +470,9 @@ def runJob(self, job, jobs): """ Method the run_pool calls when an available thread becomes ready """ # Its possible, the queue is just trying to empty. 
Allow it to do so # with out generating overhead - if self.__error_state: - return + with self.__error_state_lock: + if self.__error_state: + return try: # see if we have enough slots to start this job @@ -497,7 +507,7 @@ def runJob(self, job, jobs): self.run(job) # Hand execution over to derived scheduler except Exception: with job.getLock(): - job.setStatus(StatusSystem().error, 'JOB EXCEPTION') + job.setStatus(StatusSystem().error, 'JOB RUN EXCEPTION') job.appendOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) if timeout_timer: From 6245b88d0c7498930c55d2032fdc6882bc7197cc Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 11:03:28 -0600 Subject: [PATCH 110/243] Remove unused variable, initialize error_state --- python/TestHarness/schedulers/Scheduler.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 1a170c942d1d..753fe7b2fb5a 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -112,10 +112,6 @@ def __init__(self, harness, params): # The last time the scheduler reported something self.last_reported_time = clock() - # True when scheduler.waitFinish() is called. This alerts the scheduler, no more jobs are - # to be scheduled. KeyboardInterrupts are then handled by the thread pools. - self.__waiting = False - # Whether or not to report long running jobs as RUNNING self.report_long_jobs = True # Whether or not to enforce the timeout of jobs @@ -210,8 +206,9 @@ def waitFinish(self): Inform the Scheduler to begin running. Block until all jobs finish. 
""" self.__sortAndLaunch() - self.__waiting = True try: + error_state = False + # wait until there is an error, or job_bank has emptied while True: with self.__bank_lock: From cc8868a39074ade49c477037b74b16bf14d70076 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 11:12:44 -0600 Subject: [PATCH 111/243] Clean up Job.getOutput() and add newlines when missing --- python/TestHarness/schedulers/Job.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index b1fb8b33a6ca..837b8f0292ed 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -365,16 +365,23 @@ def getEndTime(self): def getOutput(self): """ Return the combined contents of output """ + # Cached output is used when reading from a results file, + # when we don't run anything and just populate results if self.cached_output: return self.cached_output + # Concatenate output in order of Runner, Tester, Job output = '' - if self._runner and self._runner.getOutput(): - output += self._runner.getOutput() - if self.__tester and self.__tester.getOutput(): - output += self.__tester.getOutput() - if self.output: - output += self.output + object_outputs = [self.getRunner().getOutput() if self.getRunner() else '', + self.getTester().getOutput() if self.getTester else '', + self.output] + for object_output in object_outputs: + if object_output: + # Append an extra line if we're missing one + if len(output) and output[-1] != '\n': + output += '\n' + output += object_output + return output def getRunner(self): From 4f39a2a6be162aa9d4623aa2de31debde58b3a77 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 11:17:47 -0600 Subject: [PATCH 112/243] Append instead of setting --- python/TestHarness/JobDAG.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 
16ee681c0e36..660ec2baee59 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -103,7 +103,7 @@ def _addEdge(self, child, parent): err_output += ' %s <--> %s' % (parent.getTestName().split('.')[1], child.getTestName().split('.')[1]) - parent.setOutput('Cyclic dependency error!\n\t' + err_output) + parent.appendOutput('Cyclic dependency error!\n\t' + err_output) parent.setStatus(parent.error, 'Cyclic or Invalid Dependency Detected!') def _setupPrereqs(self): From ea4ce8641ba20ee0f424b8cd89c9bdd6698848d7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 11:17:53 -0600 Subject: [PATCH 113/243] Simplify len call --- python/TestHarness/schedulers/Job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 837b8f0292ed..1689b49634ac 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -378,7 +378,7 @@ def getOutput(self): for object_output in object_outputs: if object_output: # Append an extra line if we're missing one - if len(output) and output[-1] != '\n': + if output and output[-1] != '\n': output += '\n' output += object_output From bcfa787e88f5b416fdfd91d5f0998f8991f71736 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 11:24:08 -0600 Subject: [PATCH 114/243] Make a function call --- python/TestHarness/schedulers/Job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 1689b49634ac..342cf7e4d70e 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -373,7 +373,7 @@ def getOutput(self): # Concatenate output in order of Runner, Tester, Job output = '' object_outputs = [self.getRunner().getOutput() if self.getRunner() else '', - self.getTester().getOutput() if self.getTester else '', + self.getTester().getOutput() if 
self.getTester() else '', self.output] for object_output in object_outputs: if object_output: From 6fe70461e31d599fd36098c68d4a98bfe38a6a79 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 11:57:07 -0600 Subject: [PATCH 115/243] More escape madness --- python/TestHarness/schedulers/RunHPC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index d7f8b4713a34..9c7679cac589 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -265,8 +265,8 @@ def submitJob(self, job): # that is quite bash like. command = tester.getCommand(options) command = command.replace('\n', ' ') + command = command.replace('"', "'") command = command.replace("'", "\'\\'\'") - command = command.replace('${', '\${') # Special logic for when we're running with apptainer, in which case # we need to manipulate the command like such From d3248e8be51663626b8647347ace3907e7537cf7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 15:05:32 -0600 Subject: [PATCH 116/243] Correct PetscJacobianTester processing --- python/TestHarness/testers/PetscJacobianTester.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/testers/PetscJacobianTester.py b/python/TestHarness/testers/PetscJacobianTester.py index 5a5326c23ef0..2cb7fb8fa92c 100644 --- a/python/TestHarness/testers/PetscJacobianTester.py +++ b/python/TestHarness/testers/PetscJacobianTester.py @@ -96,18 +96,18 @@ def __compare(self, value, threshold): return False def processResults(self, moose_dir, options, exit_code, runner_output): - output = super().processResults(moose_dir, options, exit_code, runner_output) + output = '' if self.old_petsc: if self.specs['state'].lower() == 'user': m = re.search("Norm of matrix ratio (\S+?),? 
difference (\S+) \(user-defined state\)", - runner_output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL) elif self.specs['state'].lower() == 'const_positive': m = re.search("Norm of matrix ratio (\S+?),? difference (\S+) \(constant state 1\.0\)", - runner_output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL) elif self.specs['state'].lower() == 'const_negative': m = re.search("Norm of matrix ratio (\S+?),? difference (\S+) \(constant state -1\.0\)", - runner_output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL) else: self.setStatus("state must be either 'user', const_positive', or 'const_negative'", self.bucket_fail) From a5c86cdfa09fb742d53fcf1939c90bc4d4b440ef Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 21:08:33 -0600 Subject: [PATCH 117/243] Allow for one failure when requesting job status --- python/TestHarness/schedulers/RunHPC.py | 15 +++++++++++++-- python/TestHarness/schedulers/RunPBS.py | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 9c7679cac589..4151200514f7 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -40,6 +40,9 @@ def __init__(self, harness, params): # Map of Job -> HPCJob self.hpc_jobs = {} + # Whether or not the last job status command failed. 
We let it + # fail once if it passes the second time for some redundancy + self.update_jobs_failed = False # The jump hostname for running commands, if any self.ssh_hosts = self.options.hpc_host # The SSH key to use for connections @@ -359,7 +362,14 @@ def getHPCJob(self, job): # Only update the statues periodically as this is called across threads if self.hpc_jobs_status_timer is None or ((clock() - self.hpc_jobs_status_timer) > self.hpc_jobs_update_interval): - self.updateJobs() + success = self.updateJobs() + if not success: + if self.update_jobs_failed: + raise Exception('Failed to get job status') + self.update_jobs_failed = True + else: + self.update_jobs_failed = False + self.hpc_jobs_status_timer = clock() return self.hpc_jobs.get(job) @@ -368,7 +378,8 @@ def updateJobs(self): """ Updates the underlying jobs. - Should be overridden. + Should be overridden and should return True or False + depending on whether or not the update succeeded. """ raise Exception('Unimplemented updateJobs()') diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f99874b818c4..5929815f77f1 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -118,7 +118,7 @@ def updateJobs(self): cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids exit_code, result, _ = self.callHPC(' '.join(cmd)) if exit_code != 0: - raise self.CallHPCException(self, 'Failed to get job status', cmd, result) + return False # Attempt to parse the status from the jobs try: @@ -168,6 +168,9 @@ def updateJobs(self): except Exception as e: raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e + # Success + return True + def killJob(self, job): """Kills a PBS job""" with self.hpc_jobs_lock: From 95402d094c2925498e95c6d27b409f8911e2474a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 21 May 2024 21:08:53 -0600 Subject: [PATCH 118/243] Don't process results if the Runner 
failed --- python/TestHarness/schedulers/Job.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 342cf7e4d70e..a1149178e984 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -332,6 +332,12 @@ def try_catch(do, exception_name): self.__start_time = self.timer.starts[0] self.__end_time = self.timer.ends[-1] + # Job error occurred, which means the Runner didn't complete + # so don't process anything else + if self.isError(): + self.cleanup() + return + if self.options.pedantic_checks and self.canParallel(): # Check if the files we checked on earlier were modified. self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) From 8adc68a50a221d6a23b565daef3977453a7b9195 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 08:12:44 -0600 Subject: [PATCH 119/243] Simplify output reading --- python/TestHarness/runners/HPCRunner.py | 11 ++-- python/TestHarness/runners/Runner.py | 53 ++++++++++++++----- .../TestHarness/runners/SubprocessRunner.py | 15 ++---- python/TestHarness/schedulers/Job.py | 9 +++- python/TestHarness/schedulers/RunParallel.py | 8 ++- python/TestHarness/testers/RunApp.py | 2 +- python/TestHarness/testers/RunException.py | 12 ----- python/TestHarness/testers/Tester.py | 15 ++---- python/TestHarness/util.py | 46 ++++------------ 9 files changed, 74 insertions(+), 97 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 1082479d8699..b07023a4a4e4 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -7,8 +7,9 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -from TestHarness.runners.Runner import Runner import re, time, os, subprocess +from TestHarness.runners.Runner import Runner +from TestHarness import util class HPCRunner(Runner): """ 
@@ -116,10 +117,8 @@ def wait(self, timer): self.trySetOutput() def print_files(files, type): if files: - self.output += '#' * 80 + f'\n{type} output file(s)\n' + '#' * 80 + '\n' - for file in files: - self.output += file + '\n' - self.output += '\n' + self.output += util.outputHeader(f'{type} output file(s)') + self.output += f'{"\n".join(files)}\n' print_files(wait_files, 'Unavailable') print_files(incomplete_files, 'Incomplete') break @@ -324,7 +323,7 @@ def readTruncated(file, start_lines=500, end_lines=500): # Form the combined output output = head if truncated: - output += f'{"#" * 80}\nOUTPUT TRIMMED\n{"#" * 80}\n' + output += util.outputHeader('OUTPUT TRIMMED') if tail: tail.reverse() output += '\n'.join(tail) diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index 5621fe6e84aa..876e801ea737 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -7,7 +7,8 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import json +import os, json +from TestHarness import util class Runner: """ @@ -53,30 +54,41 @@ def kill(self): """ pass - def getOutput(self): + def finalize(self): """ - Gets the combined output of the process. 
+ Finalizes the output, which should be called at the end of wait() """ - output = self.output + # Load the redirected output files, if any + for file_path in self.job.getTester().getRedirectedOutputFiles(self.options): + self.output += util.outputHeader(f'Redirected output {file_path}') + if os.access(file_path, os.R_OK): + with open(file_path, 'r+b') as f: + self.output += self.readOutput(f) + else: + self.job.setStatus(self.job.error, 'FILE TIMEOUT') + self.output += 'FILE UNAVAILABLE\n' # Check for invalid unicode in output try: - json.dumps(output) + json.dumps(self.output) except UnicodeDecodeError: # Convert invalid output to something json can handle - output = output.decode('utf-8','replace').encode('ascii', 'replace') + self.output = self.output.decode('utf-8','replace').encode('ascii', 'replace') # Alert the user that output has invalid characters - self.job.addCaveats('invalid characters in stdout') + self.job.addCaveats('invalid characters in output') # Remove NULL output and fail if it exists null_chars = ['\0', '\x00'] for null_char in null_chars: - if null_char in output: - output = output.replace(null_char, 'NULL') - if not self.job.isFail(): - self.job.setStatus(self.job.error, 'NULL characters in output') + if null_char in self.output: + self.output = self.output.replace(null_char, 'NULL') + self.job.setStatus(self.job.error, 'NULL characters in output') - return output + def getOutput(self): + """ + Gets the combined output of the process. + """ + return self.output def getExitCode(self): """ @@ -91,3 +103,20 @@ def sendSignal(self, signal): Can be overridden. """ raise Exception('sendSignal not supported for this Runner') + + def readOutput(self, stream): + """ + Helper for reading output from a stream, and setting an error state + if the read failed. 
+ """ + output = '' + try: + stream.seek(0) + output = stream.read().decode('utf-8') + except UnicodeDecodeError: + self.job.setStatus(self.job.error, 'non-unicode characters in output') + except: + self.job.setStatus(self.job.error, 'error reading output') + if output and output[-1] != '\n': + output += '\n' + return output diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index 3d8b4c790c1a..216dc3cd1f55 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -26,8 +26,6 @@ def __init__(self, job, options): self.errfile = None # The underlying subprocess self.process = None - # The joined output - self.output = None def spawn(self, timer): tester = self.job.getTester() @@ -80,13 +78,11 @@ def wait(self, timer): timer.stop() self.exit_code = self.process.poll() - self.outfile.flush() - self.errfile.flush() - # store the contents of output, and close the file - self.output = util.readOutput(self.outfile, self.errfile, self.job.getTester()) - self.outfile.close() - self.errfile.close() + for file in [self.outfile, self.errfile]: + file.flush() + self.output += self.readOutput(file) + file.close() def kill(self): if self.process is not None: @@ -103,9 +99,6 @@ def kill(self): except OSError: # Process already terminated pass - def getOutput(self): - return self.output - def sendSignal(self, signal): # process.poll() returns the process's exit code if it has completed, # and None if it is still running. 
This acts as a safety precaution diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index a1149178e984..9a2e14d6dd5f 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -7,11 +7,12 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import itertools, re, os, json, time, threading +import itertools, re, os, time, threading from timeit import default_timer as clock from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker from TestHarness.runners.Runner import Runner +from TestHarness import util from tempfile import TemporaryDirectory import traceback @@ -310,7 +311,8 @@ def try_catch(do, exception_name): except: self.cleanup() self.setStatus(self.error, f'{exception_name} EXCEPTION') - self.output += '\n\nPython exception encountered:\n' + traceback.format_exc() + self.output += util.outputHeader('Python exception encountered') + self.output += traceback.format_exc() return False return True @@ -338,6 +340,9 @@ def try_catch(do, exception_name): self.cleanup() return + # And do finalize (really just cleans up output) + self._runner.finalize() + if self.options.pedantic_checks and self.canParallel(): # Check if the files we checked on earlier were modified. 
self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 82867bac45b5..65218e9c8835 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -10,7 +10,6 @@ import traceback from TestHarness.schedulers.Scheduler import Scheduler -from TestHarness.StatusSystem import StatusSystem from TestHarness import util from TestHarness.runners.SubprocessRunner import Runner, SubprocessRunner from TestHarness.testers.Tester import Tester @@ -50,8 +49,6 @@ def run(self, job): job.cached_output = job_results['OUTPUT'] return - output = '' - # Anything that throws while running or processing a job should be caught # and the job should fail try: @@ -62,8 +59,9 @@ def run(self, job): if not tester.isSkip() and not tester.isFail(): self.setSuccessfulMessage(tester) except Exception: - self.output += 'Python exception encountered:\n\n' + traceback.format_exc() - tester.setStatus(StatusSystem().error, 'JOB EXCEPTION') + job.appendOutput(util.outputHeader('Python exception encountered in Job')) + job.appendOutput(traceback.format_exc()) + job.setStatus(job.error, 'JOB EXCEPTION') if job.getOutputFile(): job.addMetaData(DIRTY_FILES=[job.getOutputFile()]) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 0148a77f4464..6a608f391e3e 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -347,7 +347,7 @@ def processResults(self, moose_dir, options, exit_code, runner_output): """ output = '' output += self.testFileOutput(moose_dir, options, runner_output) - output += self.testExitCodes(moose_dir, options, exit_code, output) + output += self.testExitCodes(moose_dir, options, exit_code, runner_output) return output diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index 
da627ab494d2..6a6aa5c1d36e 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -62,15 +62,3 @@ def getProcs(self, options): self.addCaveats('hpc max_cpus=1') return 1 return procs - - def processResults(self, moose_dir, options, exit_code, runner_output): - output = '' - - # Exceptions are written to stderr, which can be interleaved so we normally redirect these - # separate files. Here we must gather those file outputs before processing - if self.hasRedirectedOutput(options): - runner_output = util.getOutputFromFiles(self, options) - - output += super().processResults(moose_dir, options, exit_code, runner_output) - - return output diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index ae27bed41ea9..6728b66c3386 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -411,7 +411,9 @@ def hasRedirectedOutput(self, options): def getRedirectedOutputFiles(self, options): """ return a list of redirected output """ - return [os.path.join(self.getTestDir(), self.name() + '.processor.{}'.format(p)) for p in range(self.getProcs(options))] + if self.hasRedirectedOutput(options): + return [os.path.join(self.getTestDir(), self.name() + '.processor.{}'.format(p)) for p in range(self.getProcs(options))] + return [] def addCaveats(self, *kwargs): """ Add caveat(s) which will be displayed with the final test status """ @@ -765,14 +767,3 @@ def run(self, options, exit_code, runner_output): self.output += '\n' + "#"*80 + '\nTester skipped, reason: ' + self.getStatusMessage() + '\n' elif self.isFail(): self.output += '\n' + "#"*80 + '\nTester failed, reason: ' + self.getStatusMessage() + '\n' - # If the tester has not yet failed, append additional information to output - else: - # Read the output either from the temporary file or redirected files - if self.hasRedirectedOutput(options): - redirected_output = util.getOutputFromFiles(self, options) - 
self.output += redirected_output - - # If we asked for redirected output but none was found, we'll call that a failure - if redirected_output == '': - self.setStatus(self.fail, 'FILE TIMEOUT') - self.output += '\n' + "#"*80 + '\nTester failed, reason: ' + self.getStatusMessage() + '\n' diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index 523e9d41409a..17474c35dade 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -869,42 +869,6 @@ def deleteFilesAndFolders(test_dir, paths, delete_folders=True): # TL;DR; Just pass... pass -# Check if test has any redirected output, and if its ready to be read -def checkOutputReady(tester, options): - checked_files = [] - for redirected_file in tester.getRedirectedOutputFiles(options): - file_path = os.path.join(tester.getTestDir(), redirected_file) - if os.access(file_path, os.R_OK): - checked_files.append(file_path) - return checked_files - -# return concatenated output from tests with redirected output -def getOutputFromFiles(tester, options): - file_output = '' - output_files = checkOutputReady(tester, options) - for file_path in output_files: - with open(file_path, 'r+b') as f: - file_output += "#"*80 + "\nOutput from " + file_path \ - + "\n" + "#"*80 + "\n" + readOutput(f, None, tester) - return file_output - -# Read stdout and stderr file objects, append error and return the string -def readOutput(stdout, stderr, tester): - output = '' - try: - if stdout: - stdout.seek(0) - output += stdout.read().decode('utf-8') - if stderr: - stderr.seek(0) - output += stderr.read().decode('utf-8') - except UnicodeDecodeError: - tester.setStatus(tester.fail, 'non-unicode characters in output') - except: - tester.setStatus(tester.fail, 'error while attempting to read output files') - - return output - # Trimming routines for job output def trimOutput(job, options): output = job.getOutput() @@ -926,3 +890,13 @@ def trimOutput(job, options): "#"*80, "#"*80, output[-second_part:]) + +def 
outputHeader(header): + """ + Returns text for output with a visual separator, i.e.: + ##############################... +
+ ##############################... + """ + sep = '#' * 80 + return f'{sep}\n{header}\n{sep}\n' From e64030afa1a65cc526c7aa184b566995af013395 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 08:14:38 -0600 Subject: [PATCH 120/243] Don't really need to catch a specific exception here --- python/TestHarness/schedulers/RunParallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 65218e9c8835..aaa6ec1ae107 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -58,7 +58,7 @@ def run(self, job): # Set the successful message if not tester.isSkip() and not tester.isFail(): self.setSuccessfulMessage(tester) - except Exception: + except: job.appendOutput(util.outputHeader('Python exception encountered in Job')) job.appendOutput(traceback.format_exc()) job.setStatus(job.error, 'JOB EXCEPTION') From b872179431fddf3fcb7f122f265aee84c6f52cc2 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 08:17:17 -0600 Subject: [PATCH 121/243] Correct syntax --- python/TestHarness/runners/HPCRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index b07023a4a4e4..0af76d96cd9c 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -118,7 +118,7 @@ def wait(self, timer): def print_files(files, type): if files: self.output += util.outputHeader(f'{type} output file(s)') - self.output += f'{"\n".join(files)}\n' + self.output += '\n'.join(files) + '\n' print_files(wait_files, 'Unavailable') print_files(incomplete_files, 'Incomplete') break From 6abff401b1f587a0756ccd3f3199c0f757d42ab5 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 08:45:35 -0600 Subject: [PATCH 122/243] Add CODEOWNERS to the test harness --- CODEOWNERS 
| 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 9797647bfc5c..84c1c00d6c61 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -77,6 +77,7 @@ /python/MooseDocs @cticenhour /python/moosesqa @cticenhour +/python/TestHarness @loganharbour @milljm /scripts/hpc_proxy.pac @loganharbour /scripts/configure_petsc.sh @cticenhour @milljm @loganharbour From 1db66351009e68b52de50853a15bea697ffdb6ad Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 10:43:54 -0600 Subject: [PATCH 123/243] This doesn't use results from parent testers --- python/TestHarness/testers/AnalyzeJacobian.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 9cab8a93495d..1f7f52731906 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -72,8 +72,6 @@ def getCommand(self, options): def processResults(self, moose_dir, options, exit_code, runner_output): - output = super().processResults(moose_dir, options, exit_code, runner_output) - reason = '' specs = self.specs if specs.isValid('expect_out'): @@ -89,7 +87,7 @@ def processResults(self, moose_dir, options, exit_code, runner_output): if reason != '': self.setStatus(self.fail, reason) - return output + return '' def checkRunnable(self, options): # We cannot rely on an external script running things within HPC From d20c427ef27511bdf0766440bf065b71d2610954 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 10:44:12 -0600 Subject: [PATCH 124/243] Skip valgrind tests for AnalyzeJacobian --- python/TestHarness/testers/AnalyzeJacobian.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 1f7f52731906..6e833b71d121 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ 
-96,4 +96,10 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False + # This doesn't pass valgrind arguments + if options.valgrind_mode: + self.addCaveats('valgrind=false') + self.setStatus(self.skip) + return False + return FileTester.checkRunnable(self, options) From 4eda7840bc10d070e69a3de34364ae3d2cb1af5c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 10:47:54 -0600 Subject: [PATCH 125/243] Disable parallelism with no_additional_cli_args --- python/TestHarness/testers/RunApp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 6a608f391e3e..5e9248dc2058 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -127,6 +127,10 @@ def checkRunnable(self, options): return True def getThreads(self, options): + # This disables additional arguments + if self.specs['no_additional_cli_args']: + return 1 + #Set number of threads to be used lower bound nthreads = max(options.nthreads, int(self.specs['min_threads'])) #Set number of threads to be used upper bound @@ -140,6 +144,10 @@ def getThreads(self, options): return nthreads def getProcs(self, options): + # This disables additional arguments + if self.specs['no_additional_cli_args']: + return 1 + if options.parallel == None: default_ncpus = 1 else: From bc2e231f99e5ca787dbd24ef8849f35c34514a39 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 11:06:31 -0600 Subject: [PATCH 126/243] Allow should_execute = true to still run the Tester --- python/TestHarness/schedulers/Job.py | 32 ++++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 9a2e14d6dd5f..9f1c9f06ac2c 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -279,8 +279,24 @@ def run(self): """ tester = self.__tester - # Do 
not execute app, but allow processResults to commence + # Helper for trying and catching + def try_catch(do, exception_name): + try: + do() + except: + self.cleanup() + self.setStatus(self.error, f'{exception_name} EXCEPTION') + self.output += util.outputHeader('Python exception encountered') + self.output += traceback.format_exc() + return False + return True + + # Do not execute app, but still run the tester + # This is truly awful and I really hate that it got put in here, + # please remove it if you can. if not tester.shouldExecute(): + run_tester = lambda: tester.run(self.options, 0, '') + try_catch(run_tester, 'TESTER RUN') return if self.options.pedantic_checks and self.canParallel(): @@ -304,18 +320,6 @@ def run(self): self.__start_time = clock() - # Helper for trying and catching - def try_catch(do, exception_name): - try: - do() - except: - self.cleanup() - self.setStatus(self.error, f'{exception_name} EXCEPTION') - self.output += util.outputHeader('Python exception encountered') - self.output += traceback.format_exc() - return False - return True - # Spawn the process spawn = lambda: self._runner.spawn(self.timer) if not try_catch(spawn, 'RUNNER SPAWN'): @@ -352,7 +356,7 @@ def try_catch(do, exception_name): runner_output = self._runner.getOutput() exit_code = self._runner.getExitCode() run_tester = lambda: tester.run(self.options, exit_code, runner_output) - try_catch(run_tester, 'TESTER PROCESS') + try_catch(run_tester, 'TESTER RUN') # Run cleanup now that we're done self.cleanup() From 1724bb737a260b52a263d0cecc47d3bd31bb836a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 22 May 2024 11:07:06 -0600 Subject: [PATCH 127/243] Also check the job failure status when setting a success --- python/TestHarness/schedulers/RunParallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index aaa6ec1ae107..0ee44473a9cf 100644 --- 
a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -56,7 +56,7 @@ def run(self, job): job.run() # Set the successful message - if not tester.isSkip() and not tester.isFail(): + if not tester.isSkip() and not job.isFail(): self.setSuccessfulMessage(tester) except: job.appendOutput(util.outputHeader('Python exception encountered in Job')) From ecd65847e3674e5d06af2ac63ed0e72de8bfebbb Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 23 May 2024 17:28:27 -0600 Subject: [PATCH 128/243] Add slurm implementation --- .gitignore | 4 +- python/TestHarness/TestHarness.py | 6 +- python/TestHarness/runners/HPCRunner.py | 26 +- python/TestHarness/schedulers/RunHPC.py | 266 +++++++++++++----- python/TestHarness/schedulers/RunPBS.py | 193 +++---------- python/TestHarness/schedulers/RunSlurm.py | 90 ++++++ .../schedulers/{pbs_template => hpc_template} | 19 +- 7 files changed, 358 insertions(+), 246 deletions(-) create mode 100644 python/TestHarness/schedulers/RunSlurm.py rename python/TestHarness/schedulers/{pbs_template => hpc_template} (77%) diff --git a/.gitignore b/.gitignore index bea0d73b9183..daa5c91444c3 100644 --- a/.gitignore +++ b/.gitignore @@ -327,6 +327,8 @@ share/ /tutorials/tutorial01_app_development/*/babbler.yaml /tutorials/darcy_thermo_mech/*/darcy_thermo_mech.yaml -# test harness pbs output +# test harness hpc output pbs_*.qsub pbs_*.out +slurm_*.sbatch +slurm_*.out diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index df4b7efe6734..33b41c88b726 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -902,6 +902,8 @@ def initialize(self, argv, app_name): if self.options.hpc == 'pbs': scheduler_plugin = 'RunPBS' + elif self.options.hpc == 'slurm': + scheduler_plugin = 'RunSlurm' # The default scheduler plugin else: scheduler_plugin = 'RunParallel' @@ -1060,14 +1062,14 @@ def parseCLArgs(self, argv): # Options for HPC execution hpcgroup = 
parser.add_argument_group('HPC Options', 'Options controlling HPC execution') - hpcgroup.add_argument('--hpc', dest='hpc', action='store', choices=['pbs'], help='Launch tests using a HPC scheduler') + hpcgroup.add_argument('--hpc', dest='hpc', action='store', choices=['pbs', 'slurm'], help='Launch tests using a HPC scheduler') hpcgroup.add_argument('--hpc-host', nargs='+', action='store', dest='hpc_host', metavar='', help='The host(s) to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') hpcgroup.add_argument('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') hpcgroup.add_argument('--hpc-apptainer-bindpath', nargs=1, action='store', type=str, dest='hpc_apptainer_bindpath', help='Sets the apptainer bindpath for HPC jobs') hpcgroup.add_argument('--hpc-apptainer-no-home', action='store_true', dest='hpc_apptainer_no_home', help='Passes --no-home to apptainer for HPC jobs') - hpcgroup.add_argument('--pbs-project', nargs=1, action='store', dest='pbs_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') + hpcgroup.add_argument('--hpc-project', nargs=1, action='store', dest='hpc_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') # Try to find the terminal size if we can diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 
0af76d96cd9c..c37a31110a60 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -21,8 +21,10 @@ def __init__(self, job, options, run_hpc): # The RunHPC object self.run_hpc = run_hpc - # The HPCJob object, updated in wait() - self.hpc_job = None + # The HPC job id, used for the file terminator + self.hpc_job_id = None + # The command ran in the HPC job, used to set later + self.hpc_job_command = None # Interval in seconds for polling for job status self.job_status_poll_time = 0.1 @@ -35,7 +37,7 @@ def __init__(self, job, options, run_hpc): def spawn(self, timer): # Rely on the RunHPC object to submit the job - self.run_hpc.submitJob(self.job) + self.hpc_job_id, self.hpc_job_command = self.run_hpc.submitJob(self.job) timer.start() @@ -48,11 +50,10 @@ def wait(self, timer): # polling itself is only done on occasion within RunHPC while True: time.sleep(self.job_status_poll_time) - self.hpc_job = self.run_hpc.getHPCJob(self.job) + self.exit_code = self.run_hpc.getHPCJobStatus(self.job) # We're done - if self.hpc_job.done: - self.exit_code = self.hpc_job.exit_code + if self.exit_code is not None: break timer.stop() @@ -63,9 +64,6 @@ def wait(self, timer): # If the Job is already finished, something happened in PBS # so we have an invalid state for processing in the Tester if self.job.isFinished(): - if self.exit_code is None: - self.exit_code = -1 - # If we have _some_ output, at least try to load it. 
for i in range(int(self.options.hpc_file_timeout / self.file_completion_poll_time)): if self.trySetOutput(): @@ -79,7 +77,7 @@ def wait(self, timer): # We've actually ran something now and not just qsub, so update the # command to what was ran there - tester.setCommandRan(self.hpc_job.command) + tester.setCommandRan(self.hpc_job_command) # Determine the output files that we need to wait for to be complete wait_files = set([output_file]) @@ -151,12 +149,6 @@ def trySetOutput(self, required=False): else: self.output = self.readTruncated(output_file) - # If we can parse the exit code here, do it. Sometimes PBS - # will do screwy stuff with not capturing the actual exit code... - find_exit_code = re.search('Completed TestHarness RunHPC test execution; exit code = (\d+)', self.output) - if find_exit_code: - self.exit_code = int(find_exit_code.group(1)) - did_set = True except: pass @@ -189,7 +181,7 @@ def fileIsReady(self, file): if is_binary is None: return False - ending_comment = self.run_hpc.getOutputEndingComment(self.hpc_job.id) + ending_comment = self.run_hpc.getOutputEndingComment(self.hpc_job_id) # Binary file if is_binary: diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 4151200514f7..cccaea8b47dc 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -8,8 +8,9 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from RunParallel import RunParallel -import threading, os, re, sys, datetime, shlex +import threading, os, re, sys, datetime, shlex, socket import paramiko +import jinja2 from multiprocessing.pool import ThreadPool from timeit import default_timer as clock @@ -111,6 +112,10 @@ def __init__(self, harness, params): if self.options.hpc_pre_source: self.source_contents = open(self.options.hpc_pre_source, 'r').read() + # Load the submission template + template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_template') + 
self.submission_template = open(template_path, 'r').read() + class HPCJob: """ Structure that represents the cached information about an HPC job @@ -217,50 +222,47 @@ def availableSlots(self, params): # Support managing 250 HPC jobs concurrently return 250, False - class JobData: - """ - Helper struct for storing the information to generate a job - """ - def __init__(self): - # The command to be ran in the job - self.command = None - # self.command but escaped so that it can be printed - self.command_printable = None - # The name of the job - self.name = None - # The number of procs to run the job with - self.num_procs = None - # The number of threads to run the job with - self.num_threads = None - # The combined stdout+stderr output file - self.output_file = None - # The additonal output files to be read (csv, exodus, etc) - self.additional_output_files = None - # The path to the submission script - self.submission_script = None - # The walltime to run the job with - self.walltime = None - def submitJob(self, job): """ Method for submitting an HPC job for the given Job. - Should be overridden. + Returns the job's ID and the command to be ran in the job. 
""" tester = job.getTester() options = self.options - job_data = self.JobData() + submission_script = self.getHPCJobSubmissionPath(job) + output_file = self.getHPCJobOutputPath(job) - # The submission script we're going to write to - job_data.submission_script = self.getHPCJobSubmissionPath(job) - # The combined stdout+stderr from the job - job_data.output_file = self.getHPCJobOutputPath(job) # Clean these two files - for file in [job_data.submission_script, job_data.output_file]: + for file in [submission_script, output_file]: if os.path.exists(file): os.remove(file) + # Add MOOSE's python path for python scripts + moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) + + # Start building the jinja environment for the submission script + submission_env = {'SCHEDULER_NAME': self.getHPCSchedulerName(), + 'NAME': self.getHPCJobName(job), + 'CWD': tester.getTestDir(), + 'OUTPUT': output_file, + 'SUBMISSION_SCRIPT': submission_script, + 'WALLTIME': str(datetime.timedelta(seconds=tester.getMaxTime())), + 'PROJECT': self.options.hpc_project, + 'TEST_SPEC': tester.getSpecFile(), + 'TEST_NAME': tester.getTestNameShort(), + 'SUBMITTED_HOSTNAME': socket.gethostname(), + 'MOOSE_PYTHONPATH': moose_python, + 'NUM_PROCS': tester.getProcs(options), + 'NUM_THREADS': tester.getThreads(options), + 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), + 'JOB_ID_VARIABLE': self.getHPCJobIDVariable()} + if self.options.hpc_pre_source: + submission_env['SOURCE_FILE'] = options.hpc_pre_source + if self.source_contents: + submission_env['SOURCE_CONTENTS'] = self.source_contents + # The command to be ran. We're going to wrap this command in single quotes # so that we don't bash evaluate anything, hence the replacement of a # single quote. Yes, this truly is madness. But it looks like it works. 
@@ -275,7 +277,7 @@ def submitJob(self, job): # we need to manipulate the command like such # Original command: # New command: apptainer exec /path/to/image '' - # This is also the reason why we have to form job_data.command_printable; + # This is also the reason why we have to form job_command_printable; # the extra quotes around need to be escaped. APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: @@ -289,8 +291,8 @@ def submitJob(self, job): else: command_prefix = '' - job_data.command = command_prefix - job_data.command_printable = command_prefix + job_command = command_prefix + job_command_printable = command_prefix # The root filesystem path that we're in so that we can be sure to bind # it into the container, if not already set @@ -305,17 +307,15 @@ def submitJob(self, job): apptainer_command.append(APPTAINER_CONTAINER) apptainer_command = shlex.join(apptainer_command) # Append the apptainer command along with the command to be ran - job_data.command += f"{apptainer_command} '{command}'" - job_data.command_printable += f"{apptainer_command} \'\\'\'{command}\'\\'\'" + job_command += f"{apptainer_command} '{command}'" + job_command_printable += f"{apptainer_command} \'\\'\'{command}\'\\'\'" # Not in apptainer, so we can just use the escaped command as is else: - job_data.command = f"'{command}'" - job_data.command_printable += f"\'\\'\'{command}\'\\'\'" + job_command = f"'{command}'" + job_command_printable += f"\'\\'\'{command}\'\\'\'" - job_data.name = self.getHPCJobName(job) - job_data.num_procs = tester.getProcs(options) - job_data.num_threads = tester.getThreads(options) - job_data.walltime = str(datetime.timedelta(seconds=tester.getMaxTime())) + submission_env['COMMAND'] = job_command + submission_env['COMMAND_PRINTABLE'] = job_command_printable # The output files that we're expected to generate so that the # HPC job can add a terminator for them so that we can verify @@ -323,35 +323,75 @@ def submitJob(self, job): 
additional_output = [] for file in tester.getOutputFiles(options): additional_output.append(f'"{os.path.join(tester.getTestDir(), file)}"') - job_data.additional_output_files = ' '.join(additional_output) + submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output) + + # Let the derived scheduler add additional variables + self.augmentJobSubmission(submission_env) + + # Build the script + jinja_env = jinja2.Environment() + definition_template = jinja_env.from_string(self.submission_template) + jinja_env.trim_blocks = True + jinja_env.lstrip_blocks = True + script = definition_template.render(**submission_env) + + # Write the script + open(submission_script, 'w').write(script) + + # Submission command. Here we have a simple bash loop + # that will try to wait for the file if it doesn't exist yet + submission_command = self.getHPCSubmissionCommand() + cmd = [f'cd {tester.getTestDir()}', + f'FILE="{submission_script}"', + 'for i in {1..40}', + 'do if [ -e "$FILE" ]', + f'then {self.getHPCSubmissionCommand()} $FILE', + 'exit $?', + 'else sleep 0.25', + 'fi', + 'done', + 'exit 1'] + cmd = '; '.join(cmd) + + # Do the submission; this is thread safe + exit_code, result, full_cmd = self.callHPC(cmd) + + # Set what we've ran for this job so that we can + # potentially get the context in an error + tester.setCommandRan(full_cmd) + + # Nonzero return code + if exit_code != 0: + raise self.CallHPCException(self, f'{submission_command} failed', full_cmd, result) - # Let the derived class actually submit the job - job_id, submit_command = self._submitJob(job, job_data) + # Parse the job ID from the command + job_id = self.parseHPCSubmissionJobID(result) # Job has been submitted, so set it as queued - job.addCaveats(job_id) + # Here we append job_id if the ID is just a number so that it's more + # obvious what it is + job.addCaveats(f'job={job_id}' if job_id.isdigit() else job_id) self.setAndOutputJobStatus(job, job.queued, caveats=True) # Setup the job in the status 
map with self.hpc_jobs_lock: if job in self.hpc_jobs: raise Exception('Job has already been submitted') - self.hpc_jobs[job] = self.HPCJob(job_id, job_data.command) + self.hpc_jobs[job] = self.HPCJob(job_id, job_command) - return submit_command + return job_id, job_command - def _submitJob(self, job, job_data): + def augmentJobSubmission(self, submission_env): """ - Submits a given job. - - Should be overridden. This is where the derived classes - will specialize how to submit the job. + Entry point for derived schedulers to append to the + submission environment, which is used to populate + the submission jinja template. """ - raise Exception('Unimplemented createJobScript()') + return - def getHPCJob(self, job): + def getHPCJobStatus(self, job): """ - Gets the HPCJob object given a Job + Gets a job's status, which is None if it is pending or running This will periodically update statues given a timer. """ @@ -362,7 +402,13 @@ def getHPCJob(self, job): # Only update the statues periodically as this is called across threads if self.hpc_jobs_status_timer is None or ((clock() - self.hpc_jobs_status_timer) > self.hpc_jobs_update_interval): - success = self.updateJobs() + # Obtain the IDs of jobs that are active that we need to poll for + active_job_ids = [] + for job, hpc_job in self.hpc_jobs.items(): + if not hpc_job.done: + active_job_ids.append(hpc_job.id) + + success = self.updateJobs(active_job_ids) if not success: if self.update_jobs_failed: raise Exception('Failed to get job status') @@ -372,7 +418,7 @@ def getHPCJob(self, job): self.hpc_jobs_status_timer = clock() - return self.hpc_jobs.get(job) + return self.hpc_jobs.get(job).exit_code def updateJobs(self): """ @@ -398,6 +444,82 @@ def augmentJobs(self, jobs): if max_time == tester.getDefaultMaxTime(): tester.setMaxTime(max_time * 2) + def killJob(self, job): + """Kills a HPC job""" + with self.hpc_jobs_lock: + if job not in self.hpc_jobs: + return + hpc_job = self.hpc_jobs[job] + if hpc_job.done or 
hpc_job.killed: + return + job_id = self.hpc_jobs[job].id + + # Don't care about whether or not this failed + self.callHPC(f'{self.getHPCCancelCommand()} {job_id}') + + def killRemaining(self, keyboard=False): + """Kills all currently running HPC jobs""" + job_ids = [] + with self.hpc_jobs_lock: + for hpc_job in self.hpc_jobs.values(): + if not hpc_job.done: + job_ids.append(hpc_job.id) + + # Don't care about whether or not this failed + self.callHPC(f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') + + with self.hpc_jobs_lock: + for hpc_job in self.hpc_jobs.values(): + if not hpc_job.done: + hpc_job.killed = True + + super().killRemaining(keyboard) + + def getHPCSchedulerName(self): + """ + Returns the name of the HPC scheduler in a simple shorthand. + + Used to produce files with a prefix of the scheduler type, i.e., + pbs_something or slurm_something. + + Should be overridden. + """ + raise Exception('Unimplemented getHPCSchedulerName()') + + def getHPCSubmissionCommand(self): + """ + Returns command used for submitting jobs. + + Should be overridden. + """ + raise Exception('Unimplemented getHPCSchedulerName()') + + def getHPCCancelCommand(self): + """ + Returns comamnd used for cancelling jobs. + + Should be overridden. + """ + raise Exception('Unimplemented getHPCCancelCommand()') + + def getHPCJobIDVariable(self): + """ + Returns the environment variable name that contains the job ID + when within a job (i.e., on a compute node). + + Should be overridden. + """ + raise Exception('Unimplemented getHPCJobIDVariable()') + + def parseHPCSubmissionJobID(self, result): + """ + Returns the job ID from the result of the submission command + (from qsub or sbatch). + + Should be overridden. 
+ """ + raise Exception('Unimplemented parseHPCSubmissionJobID()') + @staticmethod def getHPCJobName(job) -> str: """Gets the name of the HPC job given a tester @@ -406,23 +528,27 @@ def getHPCJobName(job) -> str: """ return job.getTestName().replace(':', '.').replace('/', '.') - @staticmethod - def getHPCJobOutputPathPrefix(job): + def getHPCJobOutputPathPrefix(self, job): """Gets the absolute path prefix for a HPC job""" - return os.path.join(job.getTestDir(), "pbs_" + job.getTestNameShort().replace('/', '.')) + scheduler_name = self.getHPCSchedulerName() + return os.path.join(job.getTestDir(), f"{scheduler_name}_" + job.getTestNameShort().replace('/', '.')) - @staticmethod - def getHPCJobOutputPath(job): + def getHPCJobOutputPath(self, job): """Gets the absolute path for stdout/stderr for a HPC job""" - return RunHPC.getHPCJobOutputPathPrefix(job) + '.out' + return self.getHPCJobOutputPathPrefix(job) + '.out' - @staticmethod - def getHPCJobSubmissionPath(job): + def getHPCJobSubmissionPath(self, job): """Gets the aboslute path for the qsub script for a HPC job""" - return RunHPC.getHPCJobOutputPathPrefix(job) + '.qsub' + return self.getHPCJobOutputPathPrefix(job) + f'.{self.getHPCSubmissionCommand()}' - def getOutputEndingComment(self, job_id) -> str: - raise Exception('Unimplemented getOutputEndingComment()') + @staticmethod + def getOutputEndingComment(job_id) -> str: + """ + Get the ending comment that is applied to all output files + that are read in order to verify that the files are fully + synced when reading during postprocessing. 
+ """ + return f'TESTHARNESS RUNHPC FILE TERMINATOR FOR {job_id}\n' @staticmethod def parseMPICommand(command) -> str: diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 5929815f77f1..e426127730ee 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -7,113 +7,21 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, re, json, socket, time -from RunParallel import RunParallel +import re, json from RunHPC import RunHPC from PBScodes import PBS_User_EXITCODES -import jinja2 ## This Class is responsible for maintaining an interface to the PBS scheduling syntax class RunPBS(RunHPC): """ - Scheduler for HPC jobs that run with PBS. + Scheduler class for the PBS HPC scheduler. """ - @staticmethod - def validParams(): - params = RunParallel.validParams() - params.addParam('queue_template', os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template'), "Location of the PBS template") - return params - - def __init__(self, harness, params): - super().__init__(harness, params) - - # Load the PBS template - template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template') - self.default_template = open(template_path, 'r').read() - - def _submitJob(self, job, job_data): - tester = job.getTester() - options = self.options - - # Add MOOSE's python path for python scripts - moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) - - # Set up the template - template_env = {'NAME': job_data.name, - 'SELECT': f'{job_data.num_procs}:mpiprocs=1:ncpus={job_data.num_threads}', - 'WALLTIME': job_data.walltime, - 'PROJECT': self.options.pbs_project, - 'OUTPUT': job_data.output_file, - 'SUBMISSION_SCRIPT': job_data.submission_script, - 'PLACE': options.hpc_place, - 'TEST_SPEC': tester.getSpecFile(), - 'TEST_NAME': tester.getTestNameShort(), - 
'SUBMITTED_HOSTNAME': socket.gethostname(), - 'CWD': tester.getTestDir(), - 'COMMAND': job_data.command, - 'COMMAND_PRINTABLE': job_data.command_printable, - 'ENDING_COMMENT': self.getOutputEndingComment('${PBS_JOBID}'), - 'MOOSE_PYTHONPATH': moose_python, - 'ADDITIONAL_OUTPUT_FILES': job_data.additional_output_files} + def augmentJobSubmission(self, submission_env): + submission_env['PLACE'] = self.options.hpc_place if self.options.hpc_queue: - template_env['QUEUE'] = options.hpc_queue - if self.options.hpc_pre_source: - template_env['SOURCE_FILE'] = options.hpc_pre_source - if self.source_contents: - template_env['SOURCE_CONTENTS'] = self.source_contents - - # Build the script - jinja_env = jinja2.Environment() - definition_template = jinja_env.from_string(self.default_template) - jinja_env.trim_blocks = True - jinja_env.lstrip_blocks = True - script = definition_template.render(**template_env) - - # Write the script - open(job_data.submission_script, 'w').write(script) - - # Submission command. 
Here we have a simple bash loop - # that will try to wait for the file if it doesn't exist yet - qsub_command = [f'cd {tester.getTestDir()}', - f'FILE="{job_data.submission_script}"', - 'for i in {1..40}', - 'do if [ -e "$FILE" ]', - 'then qsub $FILE', - 'exit $?', - 'else sleep 0.25', - 'fi', - 'done', - 'exit 1'] - qsub_command = '; '.join(qsub_command) - - # Do the submission; this is thread safe - # Eventually we might want to make this a pool so we can submit multiple - # jobs at the same time - exit_code, result, full_qsub_command = self.callHPC(qsub_command) - - # Set what we've ran for this job so that we can - # potentially get the context in an error - job.getTester().setCommandRan(full_qsub_command) - - # Nonzero return code - if exit_code != 0: - raise self.CallHPCException(self, 'qsub failed', qsub_command, result) - - # Make sure the job ID is something we'd expect - job_id = result - search = re.search('^[0-9]+.[a-zA-Z0-9_-]+$', job_id) - if not search: - raise self.CallHPCException(self, f'qsub has unexpected ID "{job_id}"', qsub_command) - - return job_id, full_qsub_command - - def updateJobs(self): - # Obtain the IDs of jobs that are active that we need to poll for - active_job_ids = [] - for job, pbs_job in self.hpc_jobs.items(): - if not pbs_job.done: - active_job_ids.append(pbs_job.id) + submission_env['QUEUE'] = self.options.hpc_queue + def updateJobs(self, active_job_ids): # Poll for all of the jobs within a single call cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids exit_code, result, _ = self.callHPC(' '.join(cmd)) @@ -125,82 +33,63 @@ def updateJobs(self): json_result = json.loads(result) job_results = json_result['Jobs'] - for job, pbs_job in self.hpc_jobs.items(): + for job, hpc_job in self.hpc_jobs.items(): # We're only updating jobs that aren't done yet - if pbs_job.done: + if hpc_job.done: continue # This job's result from the qstat command - job_result = job_results[pbs_job.id] + job_result = job_results[hpc_job.id] exit_code = 
job_result.get('Exit_status') if exit_code is not None: exit_code = int(exit_code) state = job_result.get('job_state') substate = job_result.get('substate') terminated = int(substate) == 91 if substate else False - done = exit_code is not None or terminated # Get the job state, and report running if it switched to running - if state == 'R' and not pbs_job.running: - pbs_job.running = True + if state == 'R' and not hpc_job.running: + hpc_job.running = True self.setAndOutputJobStatus(job, job.running, caveats=True) # If we were running but now we're done, we're not running anymore - if pbs_job.running and done: - pbs_job.running = False - - # Update the PBSJob structure - pbs_job.done = done - pbs_job.exit_code = exit_code - - # Negative exit code, means PBS killed it for some reason - # Try to find it in our pbs exit code list to return something useful - if exit_code is not None and exit_code < 0: - name_reason_tup = PBS_User_EXITCODES.get(exit_code) - if name_reason_tup is not None: - name, _ = name_reason_tup - job.setStatus(job.error, f'PBS ERROR: {name}') - else: - terminated = True - # Mark the job as terminated (past walltime, over resources, killed) - if terminated and not job.isFinished(): - job.setStatus(job.error, 'PBS JOB TERMINATED') + if exit_code is not None or terminated: + hpc_job.running = False + hpc_job.done = True + hpc_job.exit_code = exit_code if exit_code is not None else 1 + + # Negative exit code, means PBS killed it for some reason + # Try to find it in our pbs exit code list to return something useful + if exit_code < 0: + name_reason_tup = PBS_User_EXITCODES.get(exit_code) + if name_reason_tup is not None: + name, _ = name_reason_tup + job.setStatus(job.error, f'PBS ERROR: {name}') + else: + terminated = True + # Fallback for all other terminations + if terminated: + job.setStatus(job.error, 'PBS JOB TERMINATED') except Exception as e: raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e # 
Success return True - def killJob(self, job): - """Kills a PBS job""" - with self.hpc_jobs_lock: - if job not in self.hpc_jobs: - return - hpc_job = self.hpc_jobs[job] - if hpc_job.done or hpc_job.killed: - return - job_id = self.hpc_jobs[job].id - - # Don't care about whether or not this failed - self.callHPC(f'qdel {job_id}') + def getHPCSchedulerName(self): + return 'pbs' - def killRemaining(self, keyboard=False): - """Kills all currently running PBS jobs""" - job_ids = [] - with self.hpc_jobs_lock: - for hpc_job in self.hpc_jobs.values(): - if not hpc_job.done: - job_ids.append(hpc_job.id) + def getHPCSubmissionCommand(self): + return 'qsub' - # Don't care about whether or not this failed - self.callHPC(f'qdel {" ".join(job_ids)}') + def getHPCCancelCommand(self): + return 'qdel' - with self.hpc_jobs_lock: - for hpc_job in self.hpc_jobs.values(): - if not hpc_job.done: - hpc_job.killed = True + def getHPCJobIDVariable(self): + return 'PBS_JOBID' - super().killRemaining(keyboard) - - def getOutputEndingComment(self, job_id): - return f'TESTHARNESS RUNPBS FILE TERMINATOR FOR {job_id}' + def parseHPCSubmissionJobID(self, result): + search = re.search('^[0-9]+.[a-zA-Z0-9_-]+$', result) + if not search: + raise Exception(f'qsub has unexpected ID {result}') + return result diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py new file mode 100644 index 000000000000..037353529748 --- /dev/null +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -0,0 +1,90 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import re +from RunHPC import RunHPC + +## This Class is responsible for maintaining an interface to the slurm scheduling syntax +class RunSlurm(RunHPC): + 
""" + Scheduler class for the slurm HPC scheduler. + """ + def __init__(self, harness, params): + super().__init__(harness, params) + + # Slurm is quite a bit faster at updating + self.hpc_jobs_update_interval = 5 + + def updateJobs(self, active_job_ids): + # Poll for all of the jobs within a single call + cmd = ['sacct', '-j', ','.join(active_job_ids), '--parsable2', '--noheader', '-o', 'jobid,exitcode,state,reason'] + exit_code, result, _ = self.callHPC(' '.join(cmd)) + if exit_code != 0: + return False + + # Attempt to parse the status from the jobs + try: + statuses = {} + for status in result.splitlines(): + # jobid,exitcode,state,reason are split by | + status_split = status.split('|') + # Slurm has sub jobs under each job, and we only care about the top-level job + id = status_split[0] + if not id.isdigit(): + continue + # exitcode is :, where the first value is the + # exit code of the process, the second is a slurm internal code + statuses[id] = {'exitcode': int(status_split[1].split(':')[0]), + 'state': status_split[2], + 'reason': status_split[3]} + + # Update the jobs that we can + for job, hpc_job in self.hpc_jobs.items(): + # We're only updating jobs that aren't done yet + if hpc_job.done: + continue + # Slurm jobs are sometimes not immediately available + status = statuses.get(hpc_job.id) + if status is None: + continue + + state = status['state'] + if state != 'PENDING' and not hpc_job.running: + hpc_job.running = True + self.setAndOutputJobStatus(job, job.running, caveats=True) + if hpc_job.running and state not in ['RUNNING', 'COMPLETING']: + hpc_job.running = False + hpc_job.done = True + hpc_job.exit_code = status['exitcode'] + if state not in ['FAILED', 'COMPLETED']: + job.setStatus(job.error, f'SLURM ERROR: {state}') + except Exception as e: + raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e + + # Success + return True + + def getHPCSchedulerName(self): + return 'slurm' + + def 
getHPCSubmissionCommand(self): + return 'sbatch' + + def getHPCCancelCommand(self): + return 'scancel' + + def getHPCJobIDVariable(self): + return 'SLURM_JOB_ID' + + def parseHPCSubmissionJobID(self, result): + search = re.search('^Submitted batch job ([0-9]+)$', result) + if not search: + raise Exception(f'Failed to parse job ID from "{result}"') + return str(search.group(1)) + diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/hpc_template similarity index 77% rename from python/TestHarness/schedulers/pbs_template rename to python/TestHarness/schedulers/hpc_template index 382726cbc4d2..9cb55fe14d2c 100644 --- a/python/TestHarness/schedulers/pbs_template +++ b/python/TestHarness/schedulers/hpc_template @@ -1,6 +1,7 @@ #!/bin/bash +{%- if SCHEDULER_NAME == "pbs" %} #PBS -N {{ NAME }} -#PBS -l select={{ SELECT }} +#PBS -l select={{ NUM_PROCS }}:mpiprocs=1:ncpus={{ NUM_THREADS }} #PBS -l walltime={{ WALLTIME }} #PBS -P {{ PROJECT }} {%- if QUEUE is defined %} @@ -9,6 +10,14 @@ #PBS -j oe #PBS -o {{ OUTPUT }} #PBS -l place={{ PLACE }} +{%- elif SCHEDULER_NAME == "slurm" %} +#SBATCH --job-name={{ NAME }} +#SBATCH --ntasks={{ NUM_PROCS }} +#SBATCH --cpus-per-task={{ NUM_THREADS }} +#SBATCH --time={{ WALLTIME }} +#SBATCH --wckey={{ PROJECT }} +#SBATCH --output={{ OUTPUT }} +{%- endif %} # Exit on failure set -e @@ -23,7 +32,8 @@ export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} # Print a useful header echo "################################################################################" -echo "TestHarness RunPBS job on $(hostname) in job ${PBS_JOBID}" +echo "TestHarness {{ SCHEDULER_NAME }} job on $(hostname) in job ${{ JOB_ID_VARIABLE }}" +echo "################################################################################" echo "Time: $(date)" echo 'Test: {{ TEST_SPEC }}:{{ TEST_NAME }}' echo 'Directory: {{ CWD }}' @@ -32,8 +42,9 @@ echo 'Submitted hostname: {{ SUBMITTED_HOSTNAME }}' echo 'Submission script: {{ 
SUBMISSION_SCRIPT }}' echo 'Output: {{ OUTPUT }}' module list + echo "################################################################################" -echo "Beginning TestHarness RunHPC test execution" +echo "Beginning TestHarness {{ SCHEDULER_NAME }} test execution" echo "################################################################################" # Move into the test directory @@ -53,7 +64,7 @@ set -e # We will read this output later on to try to capture the return code # in the event that PBS doesn't get it to us correctly echo "################################################################################" -echo "Completed TestHarness RunHPC test execution; exit code = $return_code" +echo "Completed TestHarness {{ SCHEDULER_NAME }} test execution; exit code = $return_code" # Append a terminator to all of the output files for file syncing across NFS ADDITIONAL_OUTPUT_FILES=({{ ADDITIONAL_OUTPUT_FILES }}) From aa76f6b6b112f8eb6b2b8a004629ccf4500e0546 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 08:10:40 -0600 Subject: [PATCH 129/243] Add more headers around output --- python/TestHarness/runners/HPCRunner.py | 7 +++++-- python/TestHarness/runners/Runner.py | 3 ++- python/TestHarness/runners/SubprocessRunner.py | 2 ++ python/TestHarness/schedulers/hpc_template | 5 ----- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index c37a31110a60..e704f722ccc9 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -141,13 +141,16 @@ def trySetOutput(self, required=False): output_file = self.run_hpc.getHPCJobOutputPath(self.job) if os.path.exists(output_file) and os.path.isfile(output_file): try: + header = f'{self.run_hpc.getHPCSchedulerName()} job {self.hpc_job_id} output' + self.output = util.outputHeader(f'Begin {header}') # If we're trying to parse output, we can't truncate it # because it might 
appear in the truncated portion if self.job.getTester().needFullOutput(self.options): - self.output = open(output_file, 'r').read() + self.output += open(output_file, 'r').read() # Not parsing the output, so just read it truncated else: - self.output = self.readTruncated(output_file) + self.output += self.readTruncated(output_file) + self.output += util.outputHeader(f'End {header}') did_set = True except: diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index 876e801ea737..7d54b954fa9e 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -60,13 +60,14 @@ def finalize(self): """ # Load the redirected output files, if any for file_path in self.job.getTester().getRedirectedOutputFiles(self.options): - self.output += util.outputHeader(f'Redirected output {file_path}') + self.output += util.outputHeader(f'Begin redirected output {file_path}') if os.access(file_path, os.R_OK): with open(file_path, 'r+b') as f: self.output += self.readOutput(f) else: self.job.setStatus(self.job.error, 'FILE TIMEOUT') self.output += 'FILE UNAVAILABLE\n' + self.output += util.outputHeader(f'End redirected output {file_path}') # Check for invalid unicode in output try: diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index 216dc3cd1f55..4d8639b0e4be 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -79,10 +79,12 @@ def wait(self, timer): self.exit_code = self.process.poll() + self.output = util.outputHeader('Begin combined stdout+stderr output') for file in [self.outfile, self.errfile]: file.flush() self.output += self.readOutput(file) file.close() + self.output += util.outputHeader('End combined stderr+stdout output') def kill(self): if self.process is not None: diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index 
9cb55fe14d2c..da80a61daad5 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -31,9 +31,7 @@ set -e export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} # Print a useful header -echo "################################################################################" echo "TestHarness {{ SCHEDULER_NAME }} job on $(hostname) in job ${{ JOB_ID_VARIABLE }}" -echo "################################################################################" echo "Time: $(date)" echo 'Test: {{ TEST_SPEC }}:{{ TEST_NAME }}' echo 'Directory: {{ CWD }}' @@ -45,7 +43,6 @@ module list echo "################################################################################" echo "Beginning TestHarness {{ SCHEDULER_NAME }} test execution" -echo "################################################################################" # Move into the test directory cd {{ CWD }} @@ -84,8 +81,6 @@ for file in ${ADDITIONAL_OUTPUT_FILES[@]}; do echo "Finalized output $file" done -echo "################################################################################" - # Append a recognizable string at the end of the output. 
We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output From 83ccffc39d8d89a8d557a4c8b101c3de7814d6b8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 08:11:05 -0600 Subject: [PATCH 130/243] Better parse PBS exits --- python/TestHarness/schedulers/RunPBS.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index e426127730ee..47f4ff77cf89 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -10,6 +10,7 @@ import re, json from RunHPC import RunHPC from PBScodes import PBS_User_EXITCODES +from TestHarness import util ## This Class is responsible for maintaining an interface to the PBS scheduling syntax class RunPBS(RunHPC): @@ -44,8 +45,6 @@ def updateJobs(self, active_job_ids): if exit_code is not None: exit_code = int(exit_code) state = job_result.get('job_state') - substate = job_result.get('substate') - terminated = int(substate) == 91 if substate else False # Get the job state, and report running if it switched to running if state == 'R' and not hpc_job.running: @@ -53,23 +52,19 @@ def updateJobs(self, active_job_ids): self.setAndOutputJobStatus(job, job.running, caveats=True) # If we were running but now we're done, we're not running anymore - if exit_code is not None or terminated: + if exit_code is not None: hpc_job.running = False hpc_job.done = True - hpc_job.exit_code = exit_code if exit_code is not None else 1 + hpc_job.exit_code = exit_code - # Negative exit code, means PBS killed it for some reason - # Try to find it in our pbs exit code list to return something useful + # Negative exit code, means PBS set a reason if exit_code < 0: - name_reason_tup = PBS_User_EXITCODES.get(exit_code) - if name_reason_tup is not None: - name, _ = name_reason_tup - job.setStatus(job.error, f'PBS ERROR: {name}') - else: - 
terminated = True - # Fallback for all other terminations - if terminated: - job.setStatus(job.error, 'PBS JOB TERMINATED') + name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) + job.setStatus(job.error, f'PBS ERROR: {name}') + job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) + # Job was killed with a signal + elif exit_code >= 128: + job.setStatus(job.error, f'PBS JOB KILLED') except Exception as e: raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e From 0cfcb9535773d0c7b60d150144ba89b4bd8276ac Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 08:21:00 -0600 Subject: [PATCH 131/243] Automatically determine the HPC scheduler if we can --- python/TestHarness/TestHarness.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 33b41c88b726..2eb4fc5bb9f8 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -895,18 +895,34 @@ def writeResults(self): print('Error while writing results to disc') sys.exit(1) + def determineScheduler(self): + # Try to figure out a HPC scheduler if we can + hpc = self.options.hpc + hpc_host = self.options.hpc_host + if hpc_host and not hpc: + if 'sawtooth' in hpc_host or 'lemhi' in hpc_host: + hpc = 'pbs' + elif 'bitterroot' in hpc_host: + hpc = 'slurm' + if hpc: + print(f'INFO: Setting --hpc={hpc} for known host {hpc_host}') + else: + print(f'ERROR: --hpc must be set with --hpc-host for an unknown host') + sys.exit(1) + + if hpc == 'pbs': + return 'RunPBS' + elif hpc == 'slurm': + return 'RunSlurm' + # The default scheduler plugin + return 'RunParallel' + def initialize(self, argv, app_name): # Load the scheduler plugins plugin_paths = [os.path.join(self.moose_dir, 'python', 'TestHarness'), os.path.join(self.moose_dir, 'share', 'moose', 'python', 
'TestHarness')] self.factory.loadPlugins(plugin_paths, 'schedulers', "IS_SCHEDULER") - if self.options.hpc == 'pbs': - scheduler_plugin = 'RunPBS' - elif self.options.hpc == 'slurm': - scheduler_plugin = 'RunSlurm' - # The default scheduler plugin - else: - scheduler_plugin = 'RunParallel' + scheduler_plugin = self.determineScheduler() # Augment the Scheduler params with plugin params plugin_params = self.factory.validParams(scheduler_plugin) From ea60c6114b6a7ed34f01e156292839f0decc9705 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 08:44:01 -0600 Subject: [PATCH 132/243] Simplify unavailable output output --- python/TestHarness/runners/HPCRunner.py | 2 +- python/TestHarness/util.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index e704f722ccc9..a18f5a42dc24 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -115,7 +115,7 @@ def wait(self, timer): self.trySetOutput() def print_files(files, type): if files: - self.output += util.outputHeader(f'{type} output file(s)') + self.output += util.outputHeader(f'{type} output file(s):', ending=False) self.output += '\n'.join(files) + '\n' print_files(wait_files, 'Unavailable') print_files(incomplete_files, 'Incomplete') diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index 17474c35dade..5e5c119a9a35 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -891,12 +891,13 @@ def trimOutput(job, options): "#"*80, output[-second_part:]) -def outputHeader(header): +def outputHeader(header, ending=True): """ Returns text for output with a visual separator, i.e.: ##############################...
##############################... """ - sep = '#' * 80 - return f'{sep}\n{header}\n{sep}\n' + begin_sep = '#' * 80 + end_sep = f'{begin_sep}\n' if ending else '' + return f'{begin_sep}\n{header}\n{end_sep}' From 736c60737dd4a7b33abcf19b5c38c45ba6d97700 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 09:29:40 -0600 Subject: [PATCH 133/243] Cleanup host assumption --- python/TestHarness/TestHarness.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 2eb4fc5bb9f8..3a402b6663ce 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -896,23 +896,13 @@ def writeResults(self): sys.exit(1) def determineScheduler(self): - # Try to figure out a HPC scheduler if we can - hpc = self.options.hpc - hpc_host = self.options.hpc_host - if hpc_host and not hpc: - if 'sawtooth' in hpc_host or 'lemhi' in hpc_host: - hpc = 'pbs' - elif 'bitterroot' in hpc_host: - hpc = 'slurm' - if hpc: - print(f'INFO: Setting --hpc={hpc} for known host {hpc_host}') - else: - print(f'ERROR: --hpc must be set with --hpc-host for an unknown host') - sys.exit(1) + if self.options.hpc_host and not self.options.hpc: + print(f'ERROR: --hpc must be set with --hpc-host for an unknown host') + sys.exit(1) - if hpc == 'pbs': + if self.options.hpc == 'pbs': return 'RunPBS' - elif hpc == 'slurm': + elif self.options.hpc == 'slurm': return 'RunSlurm' # The default scheduler plugin return 'RunParallel' @@ -1113,6 +1103,16 @@ def parseCLArgs(self, argv): self.options = parser.parse_args(argv[1:]) self.options.code = code + # Try to guess the --hpc option if --hpc-host is set + if self.options.hpc_host and not self.options.hpc: + hpc_host = self.options.hpc_host[0] + if 'sawtooth' in hpc_host or 'lemhi' in hpc_host: + self.options.hpc = 'pbs' + elif 'bitterroot' in hpc_host: + self.options.hpc = 'slurm' + if self.options.hpc: + 
print(f'INFO: Setting --hpc={self.options.hpc} for known host {hpc_host}') + self.options.runtags = [tag for tag in self.options.run.split(',') if tag != ''] # Convert all list based options of length one to scalars From b93451a1ddc1e6a0e3d95c73694ac88ac4f7df17 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 09:58:31 -0600 Subject: [PATCH 134/243] Remove unused import --- python/TestHarness/runners/HPCRunner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index a18f5a42dc24..5ff14118cb8c 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -42,9 +42,6 @@ def spawn(self, timer): timer.start() def wait(self, timer): - # Need to import here to avoid cyclic includes - from TestHarness.schedulers.RunHPC import RunHPC - # Poll loop waiting for the job to be finished # This gets a structure that represents the job, and the # polling itself is only done on occasion within RunHPC From 7428881c2406f971ddeebeb5e1c146693e1afd28 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 09:58:50 -0600 Subject: [PATCH 135/243] Move HPCJob and remove unused variables --- python/TestHarness/schedulers/RunHPC.py | 39 ++++++++++++------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index cccaea8b47dc..95ac925dd0fd 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -14,6 +14,23 @@ from multiprocessing.pool import ThreadPool from timeit import default_timer as clock +class HPCJob: + """ + Structure that represents the cached information about an HPC job + """ + def __init__(self, id, command): + # The job identifier + self.id = id + # Whether or not this job is done; here done doesn't mean if it + # was successful or not, just if it is not running/queued anymore + 
self.done = False + # The exit code of the command that was ran (if any) + self.exit_code = None + # Whether or not this job was killed; used so what we don't + # bother killing a job multiple times + self.killed = False + # Whether or not the job is currently running + self.running = False class RunHPC(RunParallel): """ Base scheduler for jobs that are ran on HPC. @@ -116,26 +133,6 @@ def __init__(self, harness, params): template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_template') self.submission_template = open(template_path, 'r').read() - class HPCJob: - """ - Structure that represents the cached information about an HPC job - """ - def __init__(self, id, command): - # The job identifier - self.id = id - # Whether or not this job is done; here done doesn't mean if it - # was successful or not, just if it is not running/queued anymore - self.done = False - # The exit code of the command that was ran (if any) - self.exit_code = None - # The command that was ran within the submission script - self.command = command - # Whether or not this job was killed; used so what we don't - # bother killing a job multiple times - self.killed = False - # Whether or not the job is currently running - self.running = False - class CallHPCException(Exception): """ Exception class for providing extra context for HPC submission errors @@ -377,7 +374,7 @@ def submitJob(self, job): with self.hpc_jobs_lock: if job in self.hpc_jobs: raise Exception('Job has already been submitted') - self.hpc_jobs[job] = self.HPCJob(job_id, job_command) + self.hpc_jobs[job] = HPCJob(job_id, job_command) return job_id, job_command From 276688e1fee50f90394d9515cc8a646c9c299759 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 09:59:04 -0600 Subject: [PATCH 136/243] Simplify job killer --- python/TestHarness/schedulers/RunHPC.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py 
b/python/TestHarness/schedulers/RunHPC.py index 95ac925dd0fd..ffed8fc4b582 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -444,12 +444,11 @@ def augmentJobs(self, jobs): def killJob(self, job): """Kills a HPC job""" with self.hpc_jobs_lock: - if job not in self.hpc_jobs: - return - hpc_job = self.hpc_jobs[job] - if hpc_job.done or hpc_job.killed: + hpc_job = self.hpc_jobs.get(job) + if hpc_job is None or hpc_job.done or hpc_job.killed: return - job_id = self.hpc_jobs[job].id + job_id = hpc_job.id + hpc_job.killed = True # Don't care about whether or not this failed self.callHPC(f'{self.getHPCCancelCommand()} {job_id}') @@ -461,15 +460,11 @@ def killRemaining(self, keyboard=False): for hpc_job in self.hpc_jobs.values(): if not hpc_job.done: job_ids.append(hpc_job.id) + hpc_job.killed = True # Don't care about whether or not this failed self.callHPC(f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') - with self.hpc_jobs_lock: - for hpc_job in self.hpc_jobs.values(): - if not hpc_job.done: - hpc_job.killed = True - super().killRemaining(keyboard) def getHPCSchedulerName(self): From da6708705cbda0f3aa5fb95588f75546118cca52 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 09:59:28 -0600 Subject: [PATCH 137/243] Rename PBS to HPC --- python/TestHarness/testers/RunException.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index 6a6aa5c1d36e..cb8c59e8b2a3 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -39,7 +39,7 @@ def checkRunnable(self, options): # "Inappropriate ioctl for device (25)" errors, so if this test # requires more procs, we can't run it if options.hpc and int(self.specs['min_parallel'] > 1): - self.addCaveats('PBS max_cpus=1') + self.addCaveats('hpc max_cpus=1') return False return RunApp.checkRunnable(self, options) 
From 86f134435d2e84598c323164e9b757eda9f92e72 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 12:04:30 -0600 Subject: [PATCH 138/243] Rewrite RunHPC to introduce more thread safety --- python/TestHarness/runners/HPCRunner.py | 18 +- python/TestHarness/schedulers/Job.py | 2 +- python/TestHarness/schedulers/RunHPC.py | 182 ++++++++++++++++----- python/TestHarness/schedulers/RunPBS.py | 62 +++---- python/TestHarness/schedulers/RunSlurm.py | 83 +++++----- python/TestHarness/schedulers/Scheduler.py | 50 +++--- 6 files changed, 242 insertions(+), 155 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 5ff14118cb8c..cc04f0ef08ac 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -21,10 +21,8 @@ def __init__(self, job, options, run_hpc): # The RunHPC object self.run_hpc = run_hpc - # The HPC job id, used for the file terminator - self.hpc_job_id = None - # The command ran in the HPC job, used to set later - self.hpc_job_command = None + # The HPC job, set during spawn() + self.hpc_job = None # Interval in seconds for polling for job status self.job_status_poll_time = 0.1 @@ -37,7 +35,7 @@ def __init__(self, job, options, run_hpc): def spawn(self, timer): # Rely on the RunHPC object to submit the job - self.hpc_job_id, self.hpc_job_command = self.run_hpc.submitJob(self.job) + self.hpc_job = self.run_hpc.submitJob(self.job) timer.start() @@ -47,7 +45,7 @@ def wait(self, timer): # polling itself is only done on occasion within RunHPC while True: time.sleep(self.job_status_poll_time) - self.exit_code = self.run_hpc.getHPCJobStatus(self.job) + self.exit_code = self.hpc_job.getExitCode() # We're done if self.exit_code is not None: @@ -72,10 +70,6 @@ def wait(self, timer): tester = self.job.getTester() - # We've actually ran something now and not just qsub, so update the - # command to what was ran there - tester.setCommandRan(self.hpc_job_command) 
- # Determine the output files that we need to wait for to be complete wait_files = set([output_file]) # Output files needed by the Tester, only if it says we should @@ -138,7 +132,7 @@ def trySetOutput(self, required=False): output_file = self.run_hpc.getHPCJobOutputPath(self.job) if os.path.exists(output_file) and os.path.isfile(output_file): try: - header = f'{self.run_hpc.getHPCSchedulerName()} job {self.hpc_job_id} output' + header = f'{self.run_hpc.getHPCSchedulerName()} job {self.hpc_job.id} output' self.output = util.outputHeader(f'Begin {header}') # If we're trying to parse output, we can't truncate it # because it might appear in the truncated portion @@ -181,7 +175,7 @@ def fileIsReady(self, file): if is_binary is None: return False - ending_comment = self.run_hpc.getOutputEndingComment(self.hpc_job_id) + ending_comment = self.run_hpc.getOutputEndingComment(self.hpc_job.id) # Binary file if is_binary: diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 9f1c9f06ac2c..e1fb54209d46 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -283,7 +283,7 @@ def run(self): def try_catch(do, exception_name): try: do() - except: + except Exception: self.cleanup() self.setStatus(self.error, f'{exception_name} EXCEPTION') self.output += util.outputHeader('Python exception encountered') diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index ffed8fc4b582..ec6a0656ee36 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from RunParallel import RunParallel -import threading, os, re, sys, datetime, shlex, socket +import threading, os, re, sys, datetime, shlex, socket, threading, time import paramiko import jinja2 from multiprocessing.pool import ThreadPool @@ -18,9 +18,13 @@ class HPCJob: """ Structure that represents the cached 
information about an HPC job """ - def __init__(self, id, command): - # The job identifier + def __init__(self, job, id, command): + # The underlying Job (only set on init, _should_ be thread safe) + self.job = job + # The job identifier (only set on init, _should_ be thread safe) self.id = id + # The command that was ran within the job + self.command = command # Whether or not this job is done; here done doesn't mean if it # was successful or not, just if it is not running/queued anymore self.done = False @@ -31,6 +35,55 @@ def __init__(self, id, command): self.killed = False # Whether or not the job is currently running self.running = False + # Lock for accessing this object + self.lock = threading.Lock() + + def getLock(self): + """ + Gets the lock for this object. + """ + return self.lock + + def set(self, **kwargs): + """ + Thread-safe setter. + """ + with self.getLock(): + for key, value in kwargs.items(): + setattr(self, key, value) + + def getExitCode(self): + """ + Gets the thread-safe exit code. + + This exit code is what is read by the HPCRunner, + which means that it needs to be locked as we're + also updating it at the same time. + """ + with self.getLock(): + return self.exit_code + + def getRunning(self): + """ + Gets the thread-safe running state. + """ + with self.getLock(): + return self.running + + def getKilled(self): + """ + Gets the thread-safe killed state. + """ + with self.getLock(): + return self.killed + + def getDone(self): + """ + Gets the thread-safe done state. + """ + with self.getLock(): + return self.done + class RunHPC(RunParallel): """ Base scheduler for jobs that are ran on HPC. 
@@ -51,16 +104,13 @@ def __init__(self, harness, params): # Lock for accessing self.hpc_jobs self.hpc_jobs_lock = threading.Lock() - # The last time statues were updated in getHPCJob() (if any) - self.hpc_jobs_status_timer = None # How often to poll for status updates in getHPCJob() self.hpc_jobs_update_interval = 10 # Map of Job -> HPCJob self.hpc_jobs = {} + # The thread that will update the HPCJobs + self.hpc_jobs_updater = None - # Whether or not the last job status command failed. We let it - # fail once if it passes the second time for some redundancy - self.update_jobs_failed = False # The jump hostname for running commands, if any self.ssh_hosts = self.options.hpc_host # The SSH key to use for connections @@ -374,9 +424,18 @@ def submitJob(self, job): with self.hpc_jobs_lock: if job in self.hpc_jobs: raise Exception('Job has already been submitted') - self.hpc_jobs[job] = HPCJob(job_id, job_command) + hpc_job = HPCJob(job, job_id, job_command) + self.hpc_jobs[job] = hpc_job + + # If the updater hasn't been started yet, start it. + # We do this here because it's locked within hpc_jobs_lock + # and it means that we won't start looking for jobs until + # we have at least one job + if not self.hpc_jobs_updater: + self.hpc_jobs_updater = threading.Thread(target=self._updateHPCJobs) + self.hpc_jobs_updater.start() - return job_id, job_command + return hpc_job def augmentJobSubmission(self, submission_env): """ @@ -386,45 +445,80 @@ def augmentJobSubmission(self, submission_env): """ return - def getHPCJobStatus(self, job): + def _updateHPCJobs(self): """ - Gets a job's status, which is None if it is pending or running - - This will periodically update statues given a timer. + Function that is called in a separate thread to update the job + status given some interval. 
"""
-        with self.hpc_jobs_lock:
-            # If this is the first time seeing this job, initialize it in the list
-            if job not in self.hpc_jobs:
-                raise Exception('Failed to get status for unsubmitted job')
-
-            # Only update the statues periodically as this is called across threads
-            if self.hpc_jobs_status_timer is None or ((clock() - self.hpc_jobs_status_timer) > self.hpc_jobs_update_interval):
-                # Obtain the IDs of jobs that are active that we need to poll for
-                active_job_ids = []
-                for job, hpc_job in self.hpc_jobs.items():
-                    if not hpc_job.done:
-                        active_job_ids.append(hpc_job.id)
+        # We want to allow failure to happen once, just not twice in a row.
+        # This is a good sanity check for when occasionally the login
+        # node doesn't respond as expected
+        update_jobs_failed = False
+
+        try:
+            while True:
+                with self.hpc_jobs_lock:
+                    active_hpc_jobs = [x for x in self.hpc_jobs.values() if not x.done]
+                if active_hpc_jobs:
+                    success = self.updateHPCJobs(active_hpc_jobs)
+                    if not success:
+                        if update_jobs_failed:
+                            self.triggerErrorState()
+                            print('ERROR: Failed to get HPC job status')
+                            return
+                        update_jobs_failed = True
+                    else:
+                        update_jobs_failed = False
+
+                # Update on the interval requested, but also make sure
+                # that we're still running
+                poll_time = 0.1
+                for i in range(int(self.hpc_jobs_update_interval / poll_time)):
+                    if not self.isRunning():
+                        return
+                    time.sleep(poll_time)
+        except:
+            self.triggerErrorState()
+            raise
+
+    def updateHPCJobs(self, active_hpc_jobs):
+        """
+        Updates the underlying jobs.
 
-        success = self.updateJobs(active_job_ids)
-        if not success:
-            if self.update_jobs_failed:
-                raise Exception('Failed to get job status')
-            self.update_jobs_failed = True
-        else:
-            self.update_jobs_failed = False
+        Should be overridden and should return True or False
+        depending on whether or not the update succeeded.
 
-        self.hpc_jobs_status_timer = clock()
+        Should use setHPCJobRunning() and setHPCJobDone()
+        to trigger changes in HPC job state.
+        """
+        raise Exception('Unimplemented updateHPCJobs()')
 
-        return self.hpc_jobs.get(job).exit_code
+    def setHPCJobRunning(self, hpc_job):
+        """
+        Sets the given HPC job as running.
 
-    def updateJobs(self):
+        This should be called within the overridden updateHPCJobs().
         """
-        Updates the underlying jobs.
+        # This is currently thread safe because we only ever change
+        # it within updateJobs(), which is only ever executed serially
+        # within the thread that calls _updateHPCJobs()
+        hpc_job.set(running=True)
+        # Print out that the job is now running
+        self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.running, caveats=True)
+
+    def setHPCJobDone(self, hpc_job, exit_code):
+        """
+        Sets the given HPC job as done.
 
-        Should be overridden and should return True or False
-        depending on whether or not the update succeeded.
+        This should be called within the overridden updateHPCJobs().
         """
-        raise Exception('Unimplemented updateJobs()')
+        hpc_job.set(running=False, done=True, exit_code=exit_code)
+
+        # We've actually run something now that didn't fail, so update
+        # the command to what was run there
+        job = hpc_job.job
+        if not job.isError():
+            job.getTester().setCommandRan(hpc_job.command)
 
     def buildRunner(self, job, options):
         from TestHarness.runners.HPCRunner import HPCRunner
@@ -445,10 +539,10 @@ def killJob(self, job):
         """Kills a HPC job"""
         with self.hpc_jobs_lock:
             hpc_job = self.hpc_jobs.get(job)
-            if hpc_job is None or hpc_job.done or hpc_job.killed:
+            if hpc_job is None or hpc_job.getDone() or hpc_job.getKilled():
                 return
             job_id = hpc_job.id
-            hpc_job.killed = True
+            hpc_job.set(killed=True)
 
             # Don't care about whether or not this failed
             self.callHPC(f'{self.getHPCCancelCommand()} {job_id}')
@@ -458,9 +552,9 @@ def killRemaining(self, keyboard=False):
         job_ids = []
         with self.hpc_jobs_lock:
             for hpc_job in self.hpc_jobs.values():
-                if not hpc_job.done:
+                if not hpc_job.getDone() and not hpc_job.getKilled():
                     job_ids.append(hpc_job.id)
-                hpc_job.killed = True
+                
hpc_job.set(killed=True) # Don't care about whether or not this failed self.callHPC(f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 47f4ff77cf89..7f65f563d65c 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -22,51 +22,43 @@ def augmentJobSubmission(self, submission_env): if self.options.hpc_queue: submission_env['QUEUE'] = self.options.hpc_queue - def updateJobs(self, active_job_ids): + def updateJobs(self, active_hpc_jobs): # Poll for all of the jobs within a single call + active_job_ids = ','.join([x.id for x in active_hpc_jobs]) cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids exit_code, result, _ = self.callHPC(' '.join(cmd)) if exit_code != 0: return False - # Attempt to parse the status from the jobs - try: - json_result = json.loads(result) - job_results = json_result['Jobs'] + # Parse the status from the jobs + json_result = json.loads(result) + job_results = json_result['Jobs'] - for job, hpc_job in self.hpc_jobs.items(): - # We're only updating jobs that aren't done yet - if hpc_job.done: - continue + for hpc_job in active_hpc_jobs: + # This job's result from the qstat command + job_result = job_results[hpc_job.id] + exit_code = job_result.get('Exit_status') + if exit_code is not None: + exit_code = int(exit_code) + state = job_result.get('job_state') - # This job's result from the qstat command - job_result = job_results[hpc_job.id] - exit_code = job_result.get('Exit_status') - if exit_code is not None: - exit_code = int(exit_code) - state = job_result.get('job_state') + # Get the job state, and report running if it switched to running + if state == 'R' and not hpc_job.getRunning(): + self.setHPCJobRunning(hpc_job) - # Get the job state, and report running if it switched to running - if state == 'R' and not hpc_job.running: - hpc_job.running = True - self.setAndOutputJobStatus(job, 
job.running, caveats=True) + # If we were running but now we're done, we're not running anymore + if exit_code is not None: + job = hpc_job.job + # Negative exit code, means PBS set a reason + if exit_code < 0: + name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) + job.setStatus(job.error, f'PBS ERROR: {name}') + job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) + # Job was killed with a signal + elif exit_code >= 128: + job.setStatus(job.error, f'PBS JOB KILLED') - # If we were running but now we're done, we're not running anymore - if exit_code is not None: - hpc_job.running = False - hpc_job.done = True - hpc_job.exit_code = exit_code - - # Negative exit code, means PBS set a reason - if exit_code < 0: - name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) - job.setStatus(job.error, f'PBS ERROR: {name}') - job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) - # Job was killed with a signal - elif exit_code >= 128: - job.setStatus(job.error, f'PBS JOB KILLED') - except Exception as e: - raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e + self.setHPCJobDone(hpc_job, exit_code) # Success return True diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 037353529748..56ee72e8dd98 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -21,51 +21,54 @@ def __init__(self, harness, params): # Slurm is quite a bit faster at updating self.hpc_jobs_update_interval = 5 - def updateJobs(self, active_job_ids): + def updateHPCJobs(self, active_hpc_jobs): # Poll for all of the jobs within a single call - cmd = ['sacct', '-j', ','.join(active_job_ids), '--parsable2', '--noheader', '-o', 'jobid,exitcode,state,reason'] + active_job_ids = ','.join([x.id for x in active_hpc_jobs]) + cmd = ['sacct', '-j', 
active_job_ids, '--parsable2', '--noheader', + '-o', 'jobid,exitcode,state,reason'] exit_code, result, _ = self.callHPC(' '.join(cmd)) if exit_code != 0: return False - # Attempt to parse the status from the jobs - try: - statuses = {} - for status in result.splitlines(): - # jobid,exitcode,state,reason are split by | - status_split = status.split('|') - # Slurm has sub jobs under each job, and we only care about the top-level job - id = status_split[0] - if not id.isdigit(): - continue - # exitcode is :, where the first value is the - # exit code of the process, the second is a slurm internal code - statuses[id] = {'exitcode': int(status_split[1].split(':')[0]), - 'state': status_split[2], - 'reason': status_split[3]} - - # Update the jobs that we can - for job, hpc_job in self.hpc_jobs.items(): - # We're only updating jobs that aren't done yet - if hpc_job.done: - continue - # Slurm jobs are sometimes not immediately available - status = statuses.get(hpc_job.id) - if status is None: - continue - - state = status['state'] - if state != 'PENDING' and not hpc_job.running: - hpc_job.running = True - self.setAndOutputJobStatus(job, job.running, caveats=True) - if hpc_job.running and state not in ['RUNNING', 'COMPLETING']: - hpc_job.running = False - hpc_job.done = True - hpc_job.exit_code = status['exitcode'] - if state not in ['FAILED', 'COMPLETED']: - job.setStatus(job.error, f'SLURM ERROR: {state}') - except Exception as e: - raise self.CallHPCException(self, f'Failed to parse collective job status', cmd, result) from e + # Parse the status from the jobs + statuses = {} + for status in result.splitlines(): + # jobid,exitcode,state,reason are split by | + status_split = status.split('|') + # Slurm has sub jobs under each job, and we only care about the top-level job + id = status_split[0] + if not id.isdigit(): + continue + # exitcode is :, where the first value is the + # exit code of the process, the second is a slurm internal code + statuses[id] = {'exitcode': 
int(status_split[1].split(':')[0]), + 'state': status_split[2], + 'reason': status_split[3]} + + # Update the jobs that we can + for hpc_job in active_hpc_jobs: + # Slurm jobs are sometimes not immediately available + status = statuses.get(hpc_job.id) + if status is None: + continue + + # The slurm job state; see slurm.schedmd.com/squeue.html#lbAG + state = status['state'] + + # Job wasn't running and it's no longer pending, so it + # is running or has at least ran + if state != 'PENDING' and not hpc_job.getRunning(): + self.setHPCJobRunning(hpc_job) + + # Job was running and isn't running anymore, so it's done + if hpc_job.getRunning() and state not in ['RUNNING', 'COMPLETING']: + # If a job COMPLETED, it's done with exit code 0 so everything + # went well. If it FAILED, it finished but returned with a + # non-zero exit code, which will be handled by the Tester. + if state not in ['FAILED', 'COMPLETED']: + hpc_job.job.setStatus(hpc_job.job.error, f'SLURM ERROR: {state}') + + self.setHPCJobDone(hpc_job, int(status['exitcode'])) # Success return True diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 753fe7b2fb5a..0bf800bb34c7 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -117,6 +117,13 @@ def __init__(self, harness, params): # Whether or not to enforce the timeout of jobs self.enforce_timeout = True + def getErrorState(self): + """ + Gets the thread-safe error state. 
+ """ + with self.__error_state_lock: + return self.__error_state + def availableSlots(self, params): """ Get the number of available slots for processing jobs and @@ -157,9 +164,7 @@ def retrieveDAGs(self): def schedulerError(self): """ boolean if the scheduler prematurely exited """ - with self.__error_state_lock: - error_state = self.__error_state - return error_state and not self.maxFailures() + return self.getErrorState() and not self.maxFailures() def maxFailures(self): """ Boolean for hitting max failures """ @@ -171,10 +176,6 @@ def run(self, job): """ Call derived run method """ return - def notifyFinishedSchedulers(self): - """ Notify derived schedulers we are finished """ - return - def augmentJobs(self, jobs): """ Allow derived schedulers to augment jobs before they perform work. @@ -201,6 +202,17 @@ def setAndOutputJobStatus(self, job, status, caveats=None): job.force_report_status = True self.handleJobStatus(job, caveats=caveats) + def isRunning(self): + """ + Returns whether or not the scheduler is currently running jobs. + """ + if self.getErrorState(): + return False + with self.__bank_lock: + if not self.__job_bank: + return False + return True + def waitFinish(self): """ Inform the Scheduler to begin running. Block until all jobs finish. 
@@ -211,15 +223,12 @@ def waitFinish(self): # wait until there is an error, or job_bank has emptied while True: - with self.__bank_lock: - if not self.__job_bank: - break - with self.__error_state_lock: - error_state = self.__error_state - if error_state: - break + if not self.isRunning(): + break sleep(0.1) + error_state = self.getErrorState() + # Completed all jobs sanity check if not error_state and self.__job_bank: raise SchedulerError('Scheduler exiting with different amount of work than what was initially tasked!') @@ -230,9 +239,6 @@ def waitFinish(self): self.status_pool.close() self.status_pool.join() - # allow derived schedulers to perform any exit routines - self.notifyFinishedSchedulers() - except KeyboardInterrupt: self.killRemaining(keyboard=True) @@ -251,9 +257,8 @@ def schedule(self, testers): This process is serial. """ # If we are not to schedule any more jobs for some reason, return now - with self.__error_state_lock: - if self.__error_state: - return + if self.getErrorState(): + return # Nothing to do if there aren't any testers if not testers: return @@ -467,9 +472,8 @@ def runJob(self, job, jobs): """ Method the run_pool calls when an available thread becomes ready """ # Its possible, the queue is just trying to empty. 
Allow it to do so # with out generating overhead - with self.__error_state_lock: - if self.__error_state: - return + if self.getErrorState(): + return try: # see if we have enough slots to start this job From 2168e745b6d9f2f3188028e95d111d26a786e655 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 12:20:50 -0600 Subject: [PATCH 139/243] Fix RunPBS updateHPCJobs after changes --- python/TestHarness/schedulers/RunPBS.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 7f65f563d65c..f1378bfd9fa8 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -22,10 +22,9 @@ def augmentJobSubmission(self, submission_env): if self.options.hpc_queue: submission_env['QUEUE'] = self.options.hpc_queue - def updateJobs(self, active_hpc_jobs): + def updateHPCJobs(self, active_hpc_jobs): # Poll for all of the jobs within a single call - active_job_ids = ','.join([x.id for x in active_hpc_jobs]) - cmd = ['qstat', '-xf', '-F', 'json'] + active_job_ids + cmd = ['qstat', '-xf', '-F', 'json'] + [x.id for x in active_hpc_jobs] exit_code, result, _ = self.callHPC(' '.join(cmd)) if exit_code != 0: return False From b99cc3bd0319b4acbe053d4d0f182146c2d53ef1 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 24 May 2024 13:33:24 -0600 Subject: [PATCH 140/243] Remove extraneous quotes --- .../tests/outputs/ray_tracing_mesh_output/tests | 8 ++++---- .../test/tests/traceray/internal_sidesets/tests | 14 +++++++------- .../ray_tracing_study/bc_create_ray/tests | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests b/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests index 8d4f35b5c417..2aba96d03bdf 100644 --- a/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests +++ 
b/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests @@ -57,7 +57,7 @@ Mesh/Partitioner/nx=2 Mesh/Partitioner/ny=2 Outputs/rays/type=RayTracingNemesis - "Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=nemesis_rays' exodiff = 'nemesis_rays.e.4.0 nemesis_rays.e.4.1 nemesis_rays.e.4.2 nemesis_rays.e.4.3' min_parallel = 4 @@ -74,7 +74,7 @@ Mesh/Partitioner/nx=2 Mesh/Partitioner/ny=2 Outputs/rays/type=RayTracingNemesis - "Outputs/rays/output_properties=\'intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=ray_mesh_output_transient_nemesis_rays' # Missing some files here beacuse exodiff doesn't like diffing empty output, # which is the case for the early transient when not all procs are hit @@ -107,7 +107,7 @@ cli_args = 'Mesh/Partitioner/type=GridPartitioner Mesh/Partitioner/ny=3 UserObjects/study/segments_on_cache_traces=false - "Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=no_segments_rays' exodiff = 'no_segments_rays.e' min_parallel = 3 @@ -122,7 +122,7 @@ Mesh/Partitioner/ny=2 Outputs/rays/type=RayTracingNemesis UserObjects/study/segments_on_cache_traces=false - "Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=no_segments_nemesis_rays' exodiff = 'no_segments_nemesis_rays.e.2.0 no_segments_nemesis_rays.e.2.1' min_parallel = 2 diff --git a/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests 
b/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests index aba5a26a614e..77ecf040d835 100644 --- a/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests +++ b/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests @@ -9,7 +9,7 @@ input = 'internal_sidesets_1d.i' csvdiff = 'internal_sidesets_1d_kill_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_1d_kill_out - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' allow_test_objects = true detail = 'one-dimensional meshes, ' @@ -19,7 +19,7 @@ input = 'internal_sidesets_2d.i' csvdiff = 'internal_sidesets_2d_kill_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_2d_kill_out - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' allow_test_objects = true detail = 'two-dimensional meshes, ' @@ -29,7 +29,7 @@ input = 'internal_sidesets_3d.i' csvdiff = 'internal_sidesets_3d_kill_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_3d_kill_out - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' allow_test_objects = true detail = 'and three-dimensional meshes.' 
@@ -43,7 +43,7 @@ input = 'internal_sidesets_1d.i' csvdiff = 'internal_sidesets_1d_reflect_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_1d_reflect_out - "RayBCs/active=\'kill_external reflect_internal\'"' + RayBCs/active="kill_external reflect_internal"' allow_test_objects = true detail = 'one-dimensional meshes, ' @@ -54,7 +54,7 @@ input = 'internal_sidesets_2d.i' csvdiff = 'internal_sidesets_2d_reflect_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_2d_reflect_out - "RayBCs/active=\'kill_external reflect_internal\'"' + RayBCs/active="kill_external reflect_internal"' allow_test_objects = true detail = 'two-dimensional meshes, ' @@ -65,7 +65,7 @@ input = 'internal_sidesets_3d.i' csvdiff = 'internal_sidesets_3d_reflect_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_3d_reflect_out - "RayBCs/active=\'kill_external reflect_internal\'"' + RayBCs/active="kill_external reflect_internal"' allow_test_objects = true detail = 'and three-dimensional meshes.' @@ -76,7 +76,7 @@ type = RunException input = 'internal_sidesets_1d.i' cli_args = 'UserObjects/study/use_internal_sidesets=false - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' expect_err = 'RayBCs are defined on internal sidesets, but the study is not set to use internal sidesets during tracing.' allow_test_objects = true diff --git a/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests b/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests index a30c94b6b812..d4fbd70eb070 100644 --- a/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests +++ b/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests @@ -19,7 +19,7 @@ allow_test_objects = true cli_args = 'Outputs/file_base=bc_create_ray_3d_out Mesh/active=gmg_3d - "RayBCs/active=\'kill_3d create_3d\'"' + RayBCs/active="kill_3d create_3d"' detail = 'and in three-dimensional meshes.' 
[] [] From 6d5bc5d0bd37a496eabbb85ff4d91ad09c62e101 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 3 Jun 2024 15:12:01 -0600 Subject: [PATCH 141/243] Add sanity check for error code --- python/TestHarness/schedulers/RunSlurm.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 56ee72e8dd98..800d744c8e9a 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -62,13 +62,16 @@ def updateHPCJobs(self, active_hpc_jobs): # Job was running and isn't running anymore, so it's done if hpc_job.getRunning() and state not in ['RUNNING', 'COMPLETING']: + exit_code = int(status['exitcode']) + if state == 'FAILED' and exit_code == 0: + raise Exception(f'Job {hpc_job.id} has unexpected exit code {exit_code} with FAILED state') # If a job COMPLETED, it's done with exit code 0 so everything # went well. If it FAILED, it finished but returned with a # non-zero exit code, which will be handled by the Tester. 
if state not in ['FAILED', 'COMPLETED']: hpc_job.job.setStatus(hpc_job.job.error, f'SLURM ERROR: {state}') - self.setHPCJobDone(hpc_job, int(status['exitcode'])) + self.setHPCJobDone(hpc_job, exit_code) # Success return True From 6fd4a5c48e7bda08b49916a766cb51b0ce99ef7a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 4 Jun 2024 08:58:20 -0600 Subject: [PATCH 142/243] Only run mpiexec in parallel --- python/TestHarness/testers/RunApp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 5e9248dc2058..bbec768e666f 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -237,7 +237,7 @@ def getCommand(self, options): elif nthreads > 1: command = command + ' --n-threads=' + str(nthreads) - if self.force_mpi or options.parallel or ncpus > 1: + if self.force_mpi or ncpus > 1: command = f'{self.mpi_command} -n {ncpus} {command}' # Arbitrary proxy command, but keep track of the command so that someone could use it later From 483d095d71a4c189cd3d01a7ee6496ab7a7c95da Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 4 Jun 2024 08:58:57 -0600 Subject: [PATCH 143/243] Allow RunException tests to run in parallel This should hopefully work now --- python/TestHarness/testers/RunException.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index cb8c59e8b2a3..6a9791036f08 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -52,13 +52,3 @@ def getOutputFiles(self, options): if self.hasRedirectedOutput(options): return self.getRedirectedOutputFiles(options) return [] - - def getProcs(self, options): - procs = super().getProcs(options) - # We seem to have issues with --redirect-output causing - # "Inappropriate ioctl for device (25)" errors, so if this test - # doesn't require more 
procs, just set it to zero - if options.hpc and int(self.specs['min_parallel']) == 1 and procs != 1: - self.addCaveats('hpc max_cpus=1') - return 1 - return procs From 9a393a94a4b38ff9d0d3bede26266f171d35299e Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 5 Jun 2024 08:39:37 -0600 Subject: [PATCH 144/243] Correct apptainer exit codes greater than 128 --- python/TestHarness/schedulers/RunHPC.py | 3 +++ python/TestHarness/schedulers/hpc_template | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index ec6a0656ee36..c44e92578278 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -356,6 +356,9 @@ def submitJob(self, job): # Append the apptainer command along with the command to be ran job_command += f"{apptainer_command} '{command}'" job_command_printable += f"{apptainer_command} \'\\'\'{command}\'\\'\'" + + # Set that we're using apptainer + submission_env['USING_APPTAINER'] = '1' # Not in apptainer, so we can just use the escaped command as is else: job_command = f"'{command}'" diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index da80a61daad5..77e1132ee90a 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -61,6 +61,17 @@ set -e # We will read this output later on to try to capture the return code # in the event that PBS doesn't get it to us correctly echo "################################################################################" +{%- if USING_APPTAINER is defined %} +# We have a special case with exit codes when we run within apptainer. Sometimes when +# codes are received when running in a container, the container will return with exit code +# 128 + . Capture that here because we don't wanna exit code a code > 128, which +# are special exit codes for the schedulers. 
+if ((return_code > 128)); then + new_return_code=$((return_code - 128)) + echo "Apptainer exited with code $return_code, using $new_return_code instead" + return_code=$new_return_code +fi +{%- endif %} echo "Completed TestHarness {{ SCHEDULER_NAME }} test execution; exit code = $return_code" # Append a terminator to all of the output files for file syncing across NFS From b9517dee4416ac5244ed6d6ffc1bcb743d2f71e7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 5 Jun 2024 09:41:00 -0600 Subject: [PATCH 145/243] Skip this test on HPC --- test/tests/misc/intermittent_failure/tests | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/tests/misc/intermittent_failure/tests b/test/tests/misc/intermittent_failure/tests index d33b0bd8c830..9221f6868ce7 100644 --- a/test/tests/misc/intermittent_failure/tests +++ b/test/tests/misc/intermittent_failure/tests @@ -10,5 +10,8 @@ valgrind = 'NONE' method = '!dbg' + + # HPC runs could lead to a timeout + hpc = false [../] [] From bcaffc6bcbae06cca7be43658493c5df8e1a0ab8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 5 Jun 2024 13:16:57 -0600 Subject: [PATCH 146/243] Skip time based test on hpc --- test/tests/time_steppers/time_adaptive/tests | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/tests/time_steppers/time_adaptive/tests b/test/tests/time_steppers/time_adaptive/tests index 004b4c1990eb..211b25805acd 100644 --- a/test/tests/time_steppers/time_adaptive/tests +++ b/test/tests/time_steppers/time_adaptive/tests @@ -13,5 +13,8 @@ # the execution slows down so much that the set delays in SlowProblem become # small compared to the overall execution overhead and the test fails. 
valgrind = NONE + + # HPC runtime isn't reliable enough + hpc = false [] [] From d4d9e29a3632f6b1a03fc7c27229b55173ff38ac Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 10 Jun 2024 10:59:33 -0600 Subject: [PATCH 147/243] Don't allow use_shell=True with HPC --- python/TestHarness/testers/Tester.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 6728b66c3386..daa239779470 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -711,6 +711,10 @@ def checkRunnableBase(self, options): if not self.specs['hpc'] and options.hpc: reasons['hpc'] = 'hpc=false' + # Use shell not supported for HPC + if self.specs['use_shell'] and options.hpc: + reasons['use_shell'] = 'no use_shell with hpc' + ##### The below must be performed last to register all above caveats ##### # Remove any matching user supplied caveats from accumulated checkRunnable caveats that # would normally produce a skipped test. From 13abd77e23fbcf9dd32cccf6d0acca65aea209cc Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 10 Jun 2024 10:59:49 -0600 Subject: [PATCH 148/243] Hopefully make the HPC shell escape more versatile --- python/TestHarness/schedulers/RunHPC.py | 46 ++++++++++++------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index c44e92578278..613bb19484f4 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -11,6 +11,7 @@ import threading, os, re, sys, datetime, shlex, socket, threading, time import paramiko import jinja2 +import shlex from multiprocessing.pool import ThreadPool from timeit import default_timer as clock @@ -310,15 +311,23 @@ def submitJob(self, job): if self.source_contents: submission_env['SOURCE_CONTENTS'] = self.source_contents - # The command to be ran. 
We're going to wrap this command in single quotes - # so that we don't bash evaluate anything, hence the replacement of a - # single quote. Yes, this truly is madness. But it looks like it works. - # Pro tip: don't ever have to run things in bash with complex syntax - # that is quite bash like. + # Get the unescaped command command = tester.getCommand(options) + + # Parse out the mpi command from the command if we're running in apptainer. + # We do this before any of the other escaping + APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') + apptainer_command_prefix = '' + if APPTAINER_CONTAINER: + mpi_command = self.parseMPICommand(command) + if mpi_command: + apptainer_command_prefix = mpi_command + command = command.replace(mpi_command, '') + + # Replace newlines, clean up spaces, escape all shell commands command = command.replace('\n', ' ') - command = command.replace('"', "'") - command = command.replace("'", "\'\\'\'") + command = ' '.join(command.split()) + command = shlex.quote(command) # Special logic for when we're running with apptainer, in which case # we need to manipulate the command like such @@ -326,20 +335,9 @@ def submitJob(self, job): # New command: apptainer exec /path/to/image '' # This is also the reason why we have to form job_command_printable; # the extra quotes around need to be escaped. 
- APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') if APPTAINER_CONTAINER: - # Separate out the MPI command - mpi_command = self.parseMPICommand(command) - # Add MPI command as the prefix and remove it from the base command - if mpi_command: - command_prefix = mpi_command - command = command.replace(mpi_command, '') - # No MPI command; nothing to do - else: - command_prefix = '' - - job_command = command_prefix - job_command_printable = command_prefix + job_command = apptainer_command_prefix + job_command_printable = apptainer_command_prefix # The root filesystem path that we're in so that we can be sure to bind # it into the container, if not already set @@ -354,15 +352,15 @@ def submitJob(self, job): apptainer_command.append(APPTAINER_CONTAINER) apptainer_command = shlex.join(apptainer_command) # Append the apptainer command along with the command to be ran - job_command += f"{apptainer_command} '{command}'" - job_command_printable += f"{apptainer_command} \'\\'\'{command}\'\\'\'" + job_command += f"{apptainer_command} {command}" + job_command_printable += f"{apptainer_command} \'\\'\'{command[1:-1]}\'\\'\'" # Set that we're using apptainer submission_env['USING_APPTAINER'] = '1' # Not in apptainer, so we can just use the escaped command as is else: - job_command = f"'{command}'" - job_command_printable += f"\'\\'\'{command}\'\\'\'" + job_command = command + job_command_printable += f"\'\\'\'{command[1:-1]}\'\\'\'" submission_env['COMMAND'] = job_command submission_env['COMMAND_PRINTABLE'] = job_command_printable From b905817821937664eba48cc1325e7180175d7c9a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 10 Jun 2024 11:00:07 -0600 Subject: [PATCH 149/243] Remove redundant skip due to use_shell=True on HPC --- test/tests/make_install/tests | 1 - 1 file changed, 1 deletion(-) diff --git a/test/tests/make_install/tests b/test/tests/make_install/tests index 3faabad200c8..0b894bf64125 100644 --- a/test/tests/make_install/tests +++ 
b/test/tests/make_install/tests @@ -34,7 +34,6 @@ mkdir -p ../../../make_install_test' use_shell = True installation_type = IN_TREE - hpc = False detail = 'from a pre-determined user-readable location;' [] From ea2b6b14bef0958f56eb8ed6331a18134267e973 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 10 Jun 2024 18:27:00 -0600 Subject: [PATCH 150/243] Run the HPC command within python instead to avoid escapes --- python/TestHarness/schedulers/RunHPC.py | 27 +++++++-------- python/TestHarness/schedulers/hpc_run.py | 38 ++++++++++++++++++++++ python/TestHarness/schedulers/hpc_template | 1 - 3 files changed, 52 insertions(+), 14 deletions(-) create mode 100755 python/TestHarness/schedulers/hpc_run.py diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 613bb19484f4..bc7ade0cc157 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -7,13 +7,12 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html +import urllib.parse from RunParallel import RunParallel -import threading, os, re, sys, datetime, shlex, socket, threading, time +import threading, os, re, sys, datetime, shlex, socket, threading, time, urllib import paramiko import jinja2 -import shlex from multiprocessing.pool import ThreadPool -from timeit import default_timer as clock class HPCJob: """ @@ -324,20 +323,24 @@ def submitJob(self, job): apptainer_command_prefix = mpi_command command = command.replace(mpi_command, '') - # Replace newlines, clean up spaces, escape all shell commands + # Replace newlines, clean up spaces, and encode the command. We encode the + # command here to be able to pass it to a python script to run later without + # dealing with any substitution or evaluation within a shell. Thus, this is + # akin to the SubprocessRunner also running commands. 
It's a bit complicated, + # but I promise that it's much better than the alternative command = command.replace('\n', ' ') command = ' '.join(command.split()) - command = shlex.quote(command) + command_encoded = urllib.parse.quote(command) + + # Script used to decode the command as described above + hpc_run = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_run.py') # Special logic for when we're running with apptainer, in which case # we need to manipulate the command like such # Original command: # New command: apptainer exec /path/to/image '' - # This is also the reason why we have to form job_command_printable; - # the extra quotes around need to be escaped. if APPTAINER_CONTAINER: job_command = apptainer_command_prefix - job_command_printable = apptainer_command_prefix # The root filesystem path that we're in so that we can be sure to bind # it into the container, if not already set @@ -351,19 +354,17 @@ def submitJob(self, job): apptainer_command.append('--no-home') apptainer_command.append(APPTAINER_CONTAINER) apptainer_command = shlex.join(apptainer_command) + # Append the apptainer command along with the command to be ran - job_command += f"{apptainer_command} {command}" - job_command_printable += f"{apptainer_command} \'\\'\'{command[1:-1]}\'\\'\'" + job_command += f"{apptainer_command} {hpc_run} {command_encoded}" # Set that we're using apptainer submission_env['USING_APPTAINER'] = '1' # Not in apptainer, so we can just use the escaped command as is else: - job_command = command - job_command_printable += f"\'\\'\'{command[1:-1]}\'\\'\'" + job_command = f'{hpc_run} {command_encoded}' submission_env['COMMAND'] = job_command - submission_env['COMMAND_PRINTABLE'] = job_command_printable # The output files that we're expected to generate so that the # HPC job can add a terminator for them so that we can verify diff --git a/python/TestHarness/schedulers/hpc_run.py b/python/TestHarness/schedulers/hpc_run.py new file mode 100755 index 
000000000000..d0f9a68a450c --- /dev/null +++ b/python/TestHarness/schedulers/hpc_run.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os, shlex, subprocess, sys, urllib.parse + +# This is a helper script for running an external process in HPC _not_ +# within a shell, which allows for continuity of running things on HPC +# just like we run them within the SubprocessRunner. It allows us to not +# deal with escaping all kinds of crud as we execute it within a shell. +# It takes a single argument, which is the url encoded thing to run, +# decodes it, and runs it. +if __name__ == '__main__': + if len(sys.argv) != 2: + print('ERROR: Expected single argument of the encoded command to run') + sys.exit(1) + + # The command should be encoded on other end with urrllib.parse.quote + encoded_command = sys.argv[1] + command = shlex.split(urllib.parse.unquote(encoded_command)) + + # Try to only print this on rank 0 + if os.environ.get('PMI_RANK', '0') == '0': + print('Running decoded command:', ' '.join(command), flush=True) + + # Run the thing; close_fds=False needed for MPI + process = subprocess.run(command, + stdout=sys.stdout, + stderr=sys.stderr, + close_fds=False) + # This is a wrapper so just exit with the code of whatever we ran + sys.exit(process.returncode) diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index 77e1132ee90a..38c908de489f 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -35,7 +35,6 @@ echo "TestHarness {{ SCHEDULER_NAME }} job on $(hostname) in job ${{ JOB_ID_VARI echo "Time: $(date)" echo 'Test: {{ TEST_SPEC }}:{{ TEST_NAME }}' 
echo 'Directory: {{ CWD }}' -echo 'Command: {{ COMMAND_PRINTABLE }}' echo 'Submitted hostname: {{ SUBMITTED_HOSTNAME }}' echo 'Submission script: {{ SUBMISSION_SCRIPT }}' echo 'Output: {{ OUTPUT }}' From d32bd9e381735e985d617b13ddecddc7e344c5b5 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 10 Jun 2024 20:24:56 -0600 Subject: [PATCH 151/243] Let passing test files take a bit longer to appear --- python/TestHarness/TestHarness.py | 2 +- python/TestHarness/runners/HPCRunner.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 3a402b6663ce..9a24a1edee9c 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1071,7 +1071,7 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--hpc', dest='hpc', action='store', choices=['pbs', 'slurm'], help='Launch tests using a HPC scheduler') hpcgroup.add_argument('--hpc-host', nargs='+', action='store', dest='hpc_host', metavar='', help='The host(s) to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') - hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=120, help='The time in seconds to wait for HPC output') + hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=300, help='The time in seconds to wait for HPC output') hpcgroup.add_argument('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') hpcgroup.add_argument('--hpc-apptainer-bindpath', nargs=1, action='store', type=str, dest='hpc_apptainer_bindpath', help='Sets the apptainer bindpath for HPC jobs') hpcgroup.add_argument('--hpc-apptainer-no-home', action='store_true', 
dest='hpc_apptainer_no_home', help='Passes --no-home to apptainer for HPC jobs') diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index cc04f0ef08ac..4700961a7b65 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -59,8 +59,9 @@ def wait(self, timer): # If the Job is already finished, something happened in PBS # so we have an invalid state for processing in the Tester if self.job.isFinished(): - # If we have _some_ output, at least try to load it. - for i in range(int(self.options.hpc_file_timeout / self.file_completion_poll_time)): + # If we have _some_ output, at least try to load it. However, don't wait + # a while for this one. + for i in range(int(60 / self.file_completion_poll_time)): if self.trySetOutput(): break time.sleep(self.file_completion_poll_time) From 6e4a1a7a0eb51d2528739efa6d0a5fcbe21c4d67 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 11 Jun 2024 14:29:03 -0600 Subject: [PATCH 152/243] Parse a timeout separately --- python/TestHarness/schedulers/RunPBS.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f1378bfd9fa8..eb9d8e130f22 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -48,11 +48,13 @@ def updateHPCJobs(self, active_hpc_jobs): # If we were running but now we're done, we're not running anymore if exit_code is not None: job = hpc_job.job - # Negative exit code, means PBS set a reason if exit_code < 0: name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) - job.setStatus(job.error, f'PBS ERROR: {name}') - job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) + if name == 'JOB_EXEC_KILL_WALLTIME': + job.setStatus(job.timeout, 'TIMEOUT') + else: + job.setStatus(job.error, f'PBS ERROR: {name}') + 
job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) # Job was killed with a signal elif exit_code >= 128: job.setStatus(job.error, f'PBS JOB KILLED') From c7c8ba9be4229360c56f9b8266ec1263cb5b6240 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 11 Jun 2024 16:19:25 -0600 Subject: [PATCH 153/243] Also set preexec_fn --- python/TestHarness/schedulers/hpc_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/hpc_run.py b/python/TestHarness/schedulers/hpc_run.py index d0f9a68a450c..745d8cdf5d6d 100755 --- a/python/TestHarness/schedulers/hpc_run.py +++ b/python/TestHarness/schedulers/hpc_run.py @@ -33,6 +33,7 @@ process = subprocess.run(command, stdout=sys.stdout, stderr=sys.stderr, - close_fds=False) + close_fds=False, + preexec_fn=os.setsid) # This is a wrapper so just exit with the code of whatever we ran sys.exit(process.returncode) From 3178d6465d4e9bcb668688cffc48c1acb40e3e06 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 17 Jun 2024 09:39:31 -0600 Subject: [PATCH 154/243] Remove preexec_fn --- python/TestHarness/schedulers/hpc_run.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/hpc_run.py b/python/TestHarness/schedulers/hpc_run.py index 745d8cdf5d6d..d0f9a68a450c 100755 --- a/python/TestHarness/schedulers/hpc_run.py +++ b/python/TestHarness/schedulers/hpc_run.py @@ -33,7 +33,6 @@ process = subprocess.run(command, stdout=sys.stdout, stderr=sys.stderr, - close_fds=False, - preexec_fn=os.setsid) + close_fds=False) # This is a wrapper so just exit with the code of whatever we ran sys.exit(process.returncode) From ad9068078da2130faf082f63452d2795f057c542 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 17 Jun 2024 10:28:22 -0600 Subject: [PATCH 155/243] Disable redirected output for HPC --- python/TestHarness/testers/RunApp.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git 
a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index bbec768e666f..301712ab8b3f 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -124,6 +124,14 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False + # We have non-deterministic issues when running with the HPC python wrapper + # and using --redirected-output. If the user explicitly requested more + # parallel procs, we can't run this + if options.hpc and self.specs['redirect_output'] == True and int(self.specs['min_parallel']) > 1: + self.addCaveats('hpc min_cpus=1') + self.setStatus(self.skip) + return False + return True def getThreads(self, options): @@ -153,11 +161,20 @@ def getProcs(self, options): else: default_ncpus = options.parallel + min_parallel = int(self.specs['min_parallel']) + # Raise the floor - ncpus = max(default_ncpus, int(self.specs['min_parallel'])) + ncpus = max(default_ncpus, min_parallel) # Lower the ceiling ncpus = min(ncpus, int(self.specs['max_parallel'])) + # We have non-deterministic issues when running with the HPC python wrapper + # and using --redirected-output. 
Here, if the user didn't explicitly request + # to use more parallel procs, we'll limit it to 1 + if options.hpc and self.specs['redirect_output'] == True and min_parallel == 1 and ncpus > 1: + self.addCaveats('hpc min_cpus=1') + return 1 + if ncpus > default_ncpus: self.addCaveats('min_cpus=' + str(ncpus)) elif ncpus < default_ncpus: From 5897f0b3c41aa7976c9c7f2beea453702541ac3d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 17 Jun 2024 14:12:26 -0600 Subject: [PATCH 156/243] Get --hpc-place=scatter working with slurm --- python/TestHarness/schedulers/RunHPC.py | 3 ++- python/TestHarness/schedulers/RunPBS.py | 1 - python/TestHarness/schedulers/hpc_template | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index bc7ade0cc157..3bfc9d5aea63 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -304,7 +304,8 @@ def submitJob(self, job): 'NUM_PROCS': tester.getProcs(options), 'NUM_THREADS': tester.getThreads(options), 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), - 'JOB_ID_VARIABLE': self.getHPCJobIDVariable()} + 'JOB_ID_VARIABLE': self.getHPCJobIDVariable(), + 'PLACE': self.options.hpc_place} if self.options.hpc_pre_source: submission_env['SOURCE_FILE'] = options.hpc_pre_source if self.source_contents: diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index eb9d8e130f22..548002891211 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -18,7 +18,6 @@ class RunPBS(RunHPC): Scheduler class for the PBS HPC scheduler. 
""" def augmentJobSubmission(self, submission_env): - submission_env['PLACE'] = self.options.hpc_place if self.options.hpc_queue: submission_env['QUEUE'] = self.options.hpc_queue diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index 38c908de489f..de243e8a42d9 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -17,6 +17,9 @@ #SBATCH --time={{ WALLTIME }} #SBATCH --wckey={{ PROJECT }} #SBATCH --output={{ OUTPUT }} +{%- if PLACE == "scatter" %} +#SBATCH --ntasks-per-node=1 +{%- endif %} {%- endif %} # Exit on failure From 65549b31ac26c124c92034b792bf0d9fb22122ed Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 17 Jun 2024 14:23:20 -0600 Subject: [PATCH 157/243] Setup slurm for a recoverable timeout --- python/TestHarness/schedulers/RunSlurm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 800d744c8e9a..e858b0402920 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -65,11 +65,18 @@ def updateHPCJobs(self, active_hpc_jobs): exit_code = int(status['exitcode']) if state == 'FAILED' and exit_code == 0: raise Exception(f'Job {hpc_job.id} has unexpected exit code {exit_code} with FAILED state') + + job = hpc_job.job + + # Job has timed out; setting a timeout status means that this + # state is recoverable + if state == 'TIMEOUT': + job.setStatus(job.timeout, 'TIMEOUT') # If a job COMPLETED, it's done with exit code 0 so everything # went well. If it FAILED, it finished but returned with a # non-zero exit code, which will be handled by the Tester. 
- if state not in ['FAILED', 'COMPLETED']: - hpc_job.job.setStatus(hpc_job.job.error, f'SLURM ERROR: {state}') + elif state not in ['FAILED', 'COMPLETED']: + job.setStatus(job.error, f'SLURM ERROR: {state}') self.setHPCJobDone(hpc_job, exit_code) From 54ec3a793b453e08595acbf1a4b22aada6742062 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 17 Jun 2024 14:25:24 -0600 Subject: [PATCH 158/243] Set file timeout as a recoverable error --- python/TestHarness/runners/HPCRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 4700961a7b65..f9bd7e939fe7 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -102,7 +102,7 @@ def wait(self, timer): # We've waited for files for too long if (wait_files or incomplete_files) and waited_time >= self.options.hpc_file_timeout: - self.job.setStatus(self.job.error, 'FILE TIMEOUT') + self.job.setStatus(self.job.timeout, 'FILE TIMEOUT') if not self.output_completed: self.trySetOutput() def print_files(files, type): From d63f4fe92543c5399c70b8b07c5dae290c1d6ac8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 09:11:33 -0600 Subject: [PATCH 159/243] Handle re-queued jobs with PBS --- python/TestHarness/schedulers/RunHPC.py | 23 ++++++++++++++++++++++- python/TestHarness/schedulers/RunPBS.py | 8 ++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 3bfc9d5aea63..48140f6f609b 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -421,7 +421,6 @@ def submitJob(self, job): # Here we append job_id if the ID is just a number so that it's more # obvious what it is job.addCaveats(f'job={job_id}' if job_id.isdigit() else job_id) - self.setAndOutputJobStatus(job, job.queued, caveats=True) # Setup the job in the status 
map with self.hpc_jobs_lock: @@ -430,6 +429,9 @@ def submitJob(self, job): hpc_job = HPCJob(job, job_id, job_command) self.hpc_jobs[job] = hpc_job + # Set the job as queued and print out that it is queued + self.setHPCJobQueued(hpc_job) + # If the updater hasn't been started yet, start it. # We do this here because it's locked within hpc_jobs_lock # and it means that we won't start looking for jobs until @@ -509,6 +511,25 @@ def setHPCJobRunning(self, hpc_job): # Print out that the job is now running self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.running, caveats=True) + def setHPCJobQueued(self, hpc_job): + """ + Sets the given HPC job as being queued. + + This can be used when the HPC scheduler re-schedules the job. + + This should be called within the overridden updateHPCJobs(). + """ + # Guard against setting this as requeued multiple times + if hpc_job.job.getStatus() == hpc_job.job.queued: + return + + # This is currently thread safe because we only ever change + # it within updateJobs(), which is only ever executed serially + # within the thread the calls _updateHPCJobs() + hpc_job.set(running=False) + # Print out that the job is queued again + self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.queued, caveats=True) + def setHPCJobDone(seflf, hpc_job, exit_code): """ Sets the given HPC job as done. 
diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 548002891211..9e62f7277475 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -49,8 +49,16 @@ def updateHPCJobs(self, active_hpc_jobs): job = hpc_job.job if exit_code < 0: name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) + # Job timed out; give this a special timeout status because + # it is then marked as recoverable (could try running again) if name == 'JOB_EXEC_KILL_WALLTIME': job.setStatus(job.timeout, 'TIMEOUT') + # Special status where the job failed to start due to a PBS + # issue and will be started again, so there's nothing to do + elif name == 'JOB_EXEC_HOOK_RERUN': + self.setHPCJobQueued(hpc_job) + continue + # Everything else should be an error else: job.setStatus(job.error, f'PBS ERROR: {name}') job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) From 53adaacf9b815750185d532732bb8fcea54d1d20 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 09:58:17 -0600 Subject: [PATCH 160/243] Automatically kill held jobs on PBS --- python/TestHarness/schedulers/RunHPC.py | 6 +++--- python/TestHarness/schedulers/RunPBS.py | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 48140f6f609b..956d6bc5e7cd 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -9,7 +9,7 @@ import urllib.parse from RunParallel import RunParallel -import threading, os, re, sys, datetime, shlex, socket, threading, time, urllib +import threading, os, re, sys, datetime, shlex, socket, threading, time, urllib, contextlib import paramiko import jinja2 from multiprocessing.pool import ThreadPool @@ -559,9 +559,9 @@ def augmentJobs(self, jobs): if max_time == tester.getDefaultMaxTime(): 
tester.setMaxTime(max_time * 2) - def killJob(self, job): + def killJob(self, job, lock=True): """Kills a HPC job""" - with self.hpc_jobs_lock: + with self.hpc_jobs_lock if lock else contextlib.suppress(): hpc_job = self.hpc_jobs.get(job) if hpc_job is None or hpc_job.getDone() or hpc_job.getKilled(): return diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 9e62f7277475..82c112225544 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -33,6 +33,8 @@ def updateHPCJobs(self, active_hpc_jobs): job_results = json_result['Jobs'] for hpc_job in active_hpc_jobs: + job = hpc_job.job + # This job's result from the qstat command job_result = job_results[hpc_job.id] exit_code = job_result.get('Exit_status') @@ -40,13 +42,23 @@ def updateHPCJobs(self, active_hpc_jobs): exit_code = int(exit_code) state = job_result.get('job_state') - # Get the job state, and report running if it switched to running + # The job has switched to running if state == 'R' and not hpc_job.getRunning(): self.setHPCJobRunning(hpc_job) + # The job is held, so we're going to consider it a failure and + # will also try to cancel it so that it doesn't hang around + if state == 'H': + job.setStatus(job.error, f'PBS JOB HELD') + job.appendOutput(util.outputHeader('The submitted PBS job was held; killed job')) + exit_code = 1 + try: + self.killJob(job, lock=False) # no lock; we're already in one + except: + pass + # If we were running but now we're done, we're not running anymore if exit_code is not None: - job = hpc_job.job if exit_code < 0: name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) # Job timed out; give this a special timeout status because From 34732d20effc1acaea3ed25ae6001f9a0c80ac60 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 10:14:52 -0600 Subject: [PATCH 161/243] Address a PBS job being killed early --- 
python/TestHarness/runners/HPCRunner.py | 5 +++-- python/TestHarness/schedulers/RunHPC.py | 13 +++++++++++++ python/TestHarness/schedulers/RunPBS.py | 11 +++++++---- python/TestHarness/schedulers/RunSlurm.py | 2 +- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index f9bd7e939fe7..275b02dbf4b5 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -56,8 +56,9 @@ def wait(self, timer): # The PBS output (stdout+stderr) output_file = self.run_hpc.getHPCJobOutputPath(self.job) - # If the Job is already finished, something happened in PBS - # so we have an invalid state for processing in the Tester + # If the Job is already finished, something happened in the + # HPC scheduler so we have an invalid state for processing + # in the Tester if self.job.isFinished(): # If we have _some_ output, at least try to load it. However, don't wait # a while for this one. diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 956d6bc5e7cd..febde3d46846 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -13,6 +13,7 @@ import paramiko import jinja2 from multiprocessing.pool import ThreadPool +from TestHarness import util class HPCJob: """ @@ -669,3 +670,15 @@ def parseMPICommand(command) -> str: if find_mpi is not None: return find_mpi.group(0) return None + + @staticmethod + def setHPCJobError(hpc_job, message, output=None): + """ + Helper for setting an error within a HPC job. + + Should be used within the derived classes updateHPCJobs(). 
+ """ + job = hpc_job.job + job.setStatus(job.error, message) + if output: + job.appendOutput(util.outputHeader(f'Job {hpc_job.id} {output}')) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 82c112225544..33ab80e52330 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -49,14 +49,18 @@ def updateHPCJobs(self, active_hpc_jobs): # The job is held, so we're going to consider it a failure and # will also try to cancel it so that it doesn't hang around if state == 'H': - job.setStatus(job.error, f'PBS JOB HELD') - job.appendOutput(util.outputHeader('The submitted PBS job was held; killed job')) + self.setHPCJobError(hpc_job, 'PBS JOB HELD', 'was held; killed job') exit_code = 1 try: self.killJob(job, lock=False) # no lock; we're already in one except: pass + # Job finished before it started, so something killed it + if state == 'F' and exit_code is None: + self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed') + exit_code = 1 + # If we were running but now we're done, we're not running anymore if exit_code is not None: if exit_code < 0: @@ -72,8 +76,7 @@ def updateHPCJobs(self, active_hpc_jobs): continue # Everything else should be an error else: - job.setStatus(job.error, f'PBS ERROR: {name}') - job.appendOutput(util.outputHeader(f'PBS terminated job with reason: {reason}')) + self.setHPCJobError(hpc_job, f'PBS ERROR: {name}', f'was terminated with reason: {reason}') # Job was killed with a signal elif exit_code >= 128: job.setStatus(job.error, f'PBS JOB KILLED') diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index e858b0402920..03fec249afeb 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -76,7 +76,7 @@ def updateHPCJobs(self, active_hpc_jobs): # went well. 
If it FAILED, it finished but returned with a # non-zero exit code, which will be handled by the Tester. elif state not in ['FAILED', 'COMPLETED']: - job.setStatus(job.error, f'SLURM ERROR: {state}') + self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'encountered SLURM state {state}') self.setHPCJobDone(hpc_job, exit_code) From d82863e2a0e753b5eb603a78aef5749c6922b621 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 10:19:15 -0600 Subject: [PATCH 162/243] Use the same callback for this error --- python/TestHarness/schedulers/RunPBS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 33ab80e52330..79bbe13cb9d9 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -79,7 +79,7 @@ def updateHPCJobs(self, active_hpc_jobs): self.setHPCJobError(hpc_job, f'PBS ERROR: {name}', f'was terminated with reason: {reason}') # Job was killed with a signal elif exit_code >= 128: - job.setStatus(job.error, f'PBS JOB KILLED') + self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') self.setHPCJobDone(hpc_job, exit_code) From 569828ef32e422505b16a5dd458d74134283ad23 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 12:00:32 -0600 Subject: [PATCH 163/243] Also set queued with JOB_EXEC_RETRY --- python/TestHarness/schedulers/RunPBS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 79bbe13cb9d9..773b98abf767 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -71,7 +71,7 @@ def updateHPCJobs(self, active_hpc_jobs): job.setStatus(job.timeout, 'TIMEOUT') # Special status where the job failed to start due to a PBS # issue and will be started again, so there's nothing to do - elif name == 'JOB_EXEC_HOOK_RERUN': + elif name in 
['JOB_EXEC_HOOK_RERUN', 'JOB_EXEC_RETRY']: self.setHPCJobQueued(hpc_job) continue # Everything else should be an error From 326d0fd27f2464643a9afe463c06f29deeae7823 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 12:10:11 -0600 Subject: [PATCH 164/243] Call it a PBS timeout instead --- python/TestHarness/schedulers/RunPBS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 773b98abf767..5b9780767daf 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -68,7 +68,7 @@ def updateHPCJobs(self, active_hpc_jobs): # Job timed out; give this a special timeout status because # it is then marked as recoverable (could try running again) if name == 'JOB_EXEC_KILL_WALLTIME': - job.setStatus(job.timeout, 'TIMEOUT') + job.setStatus(job.timeout, 'PBS TIMEOUT') # Special status where the job failed to start due to a PBS # issue and will be started again, so there's nothing to do elif name in ['JOB_EXEC_HOOK_RERUN', 'JOB_EXEC_RETRY']: From 4f4d6283141dda01d6985082340e752bf9c9a4cd Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 12:28:09 -0600 Subject: [PATCH 165/243] Catch held slurm jobs --- python/TestHarness/schedulers/RunSlurm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 03fec249afeb..0c0b49dee55f 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -60,6 +60,11 @@ def updateHPCJobs(self, active_hpc_jobs): if state != 'PENDING' and not hpc_job.getRunning(): self.setHPCJobRunning(hpc_job) + # Fail if the job is held + if state in ['JobHeldUser']: + self.setHPCJobError(hpc_job, f'SLURM STATE {state}', f'has state {state}') + self.setHPCJobDone(hpc_job, 1) + # Job was running and isn't running anymore, so it's done if 
hpc_job.getRunning() and state not in ['RUNNING', 'COMPLETING']: exit_code = int(status['exitcode']) From 0787744b86890f70c656a23f89e6ffa7918942d0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 18 Jun 2024 12:33:47 -0600 Subject: [PATCH 166/243] Cleanup the slurm output error state --- python/TestHarness/schedulers/RunSlurm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 0c0b49dee55f..fd063b462bd0 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -62,7 +62,7 @@ def updateHPCJobs(self, active_hpc_jobs): # Fail if the job is held if state in ['JobHeldUser']: - self.setHPCJobError(hpc_job, f'SLURM STATE {state}', f'has state {state}') + self.setHPCJobError(hpc_job, f'SLURM STATE {state}', f'has state "{state}"') self.setHPCJobDone(hpc_job, 1) # Job was running and isn't running anymore, so it's done @@ -81,7 +81,7 @@ def updateHPCJobs(self, active_hpc_jobs): # went well. If it FAILED, it finished but returned with a # non-zero exit code, which will be handled by the Tester. 
elif state not in ['FAILED', 'COMPLETED']: - self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'encountered SLURM state {state}') + self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'has state "{state}"') self.setHPCJobDone(hpc_job, exit_code) From c160ecd5a374066ae85e8e92daf5cb1f6eec6f24 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 12:13:30 -0600 Subject: [PATCH 167/243] Pre-submit jobs when possible --- python/TestHarness/StatusSystem.py | 49 +- python/TestHarness/TestHarness.py | 1 + python/TestHarness/runners/HPCRunner.py | 30 +- python/TestHarness/schedulers/RunHPC.py | 602 ++++++++++++--------- python/TestHarness/schedulers/RunPBS.py | 90 +-- python/TestHarness/schedulers/RunSlurm.py | 51 +- python/TestHarness/schedulers/Scheduler.py | 31 +- python/TestHarness/schedulers/hpc_template | 6 + 8 files changed, 507 insertions(+), 353 deletions(-) diff --git a/python/TestHarness/StatusSystem.py b/python/TestHarness/StatusSystem.py index a347abdcb82d..62f40b0385e5 100644 --- a/python/TestHarness/StatusSystem.py +++ b/python/TestHarness/StatusSystem.py @@ -8,6 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from collections import namedtuple +import threading def initStatus(): status = namedtuple('status', 'status color code sort_value') @@ -104,7 +105,10 @@ class StatusSystem(object): running] def __init__(self): + # The underlying status self.__status = self.no_status + # The lock for reading/changing the status + self.__lock = threading.Lock() def createStatus(self, status_key='NA'): """ return a specific status object based on supplied status name """ @@ -114,25 +118,30 @@ def createStatus(self, status_key='NA'): def getStatus(self): """ - Return the status object. + Return the thread-safe status object. 
""" - return self.__status + with self.__lock: + return self.__status - def getAllStatuses(self): + @staticmethod + def getAllStatuses(): """ return list of named tuples containing all status types """ - return self.__all_statuses + return StatusSystem.__all_statuses - def getFailingStatuses(self): + @staticmethod + def getFailingStatuses(): """ return list of named tuples containing failing status types """ - return self.__exit_nonzero_statuses + return StatusSystem.__exit_nonzero_statuses - def getSuccessStatuses(self): + @staticmethod + def getSuccessStatuses(): """ return list of named tuples containing exit code zero status types """ - return self.__exit_zero_statuses + return StatusSystem.__exit_zero_statuses - def getPendingStatuses(self): + @staticmethod + def getPendingStatuses(): """ return list of named tuples containing pending status types """ - return self.__pending_statuses + return StatusSystem.__pending_statuses def setStatus(self, status=no_status): """ @@ -140,14 +149,16 @@ def setStatus(self, status=no_status): There is a validation check during this process to ensure the named tuple adheres to this class's set statuses. """ - if self.isValid(status): - self.__status = status - else: - raise StatusSystemError('Invalid status! %s' % (str(status))) - return self.__status - - def isValid(self, status): - original = set(self.no_status._asdict().keys()) + with self.__lock: + if self.isValid(status): + self.__status = status + else: + raise StatusSystemError('Invalid status! 
%s' % (str(status))) + return self.__status + + @staticmethod + def isValid(status): + original = set(StatusSystem.no_status._asdict().keys()) altered = set(status._asdict().keys()) - if not original.difference(altered) or status in self.__all_statuses: + if not original.difference(altered) or status in StatusSystem.getAllStatuses(): return True diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 9a24a1edee9c..7a2d1eb94208 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1076,6 +1076,7 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--hpc-apptainer-bindpath', nargs=1, action='store', type=str, dest='hpc_apptainer_bindpath', help='Sets the apptainer bindpath for HPC jobs') hpcgroup.add_argument('--hpc-apptainer-no-home', action='store_true', dest='hpc_apptainer_no_home', help='Passes --no-home to apptainer for HPC jobs') hpcgroup.add_argument('--hpc-project', nargs=1, action='store', dest='hpc_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') + hpcgroup.add_argument('--hpc-no-hold', nargs=1, action='store', type=bool, default=False, dest='hpc_no_hold', help='Do not pre-create hpc jobs to be held') hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') # Try to find the terminal size if we can diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 275b02dbf4b5..37608f3b384e 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -34,22 +34,27 @@ def __init__(self, job, options, run_hpc): self.output_completed = False def spawn(self, timer): - # Rely on the RunHPC object to submit the job - self.hpc_job = self.run_hpc.submitJob(self.job) + # Rely on the RunHPC object to queue the job + self.hpc_job = self.run_hpc.queueJob(self.job) 
timer.start() def wait(self, timer): + # The states that we should wait on. Anything else should + # be an invalid state for waiting + wait_states = [self.hpc_job.State.held, + self.hpc_job.State.queued, + self.hpc_job.State.running] + # Poll loop waiting for the job to be finished # This gets a structure that represents the job, and the # polling itself is only done on occasion within RunHPC while True: time.sleep(self.job_status_poll_time) - self.exit_code = self.hpc_job.getExitCode() - - # We're done - if self.exit_code is not None: - break + with self.hpc_job.getLock(): + if self.hpc_job.state not in wait_states: + self.exit_code = self.hpc_job.exit_code + break timer.stop() @@ -60,6 +65,10 @@ def wait(self, timer): # HPC scheduler so we have an invalid state for processing # in the Tester if self.job.isFinished(): + # Don't bother if we've been killed + if self.hpc_job.isKilled(): + return + # If we have _some_ output, at least try to load it. However, don't wait # a while for this one. 
for i in range(int(60 / self.file_completion_poll_time)): @@ -84,6 +93,10 @@ def wait(self, timer): # Wait for all of the files to be available waited_time = 0 while wait_files or incomplete_files: + # Don't bother if we've been killed + if self.hpc_job.isKilled(): + return + # Look for each file for file in wait_files.copy(): if os.path.exists(file) and os.path.isfile(file): @@ -118,7 +131,8 @@ def print_files(files, type): time.sleep(self.file_completion_poll_time) def kill(self): - self.run_hpc.killJob(self.job) + if self.hpc_job: + self.run_hpc.killHPCJob(self.hpc_job) def trySetOutput(self, required=False): """ diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index febde3d46846..a92c43a43884 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -10,32 +10,31 @@ import urllib.parse from RunParallel import RunParallel import threading, os, re, sys, datetime, shlex, socket, threading, time, urllib, contextlib +from enum import Enum import paramiko import jinja2 +import copy from multiprocessing.pool import ThreadPool from TestHarness import util class HPCJob: + # The valid job states for a HPC job + State = Enum('State', ['waiting', 'held', 'queued', 'running', 'done', 'killed']) + """ Structure that represents the cached information about an HPC job """ - def __init__(self, job, id, command): - # The underlying Job (only set on init, _should_ be thread safe) + def __init__(self, job): + # The underlying Job self.job = job - # The job identifier (only set on init, _should_ be thread safe) - self.id = id + # The ID of the HPC job + self.id = None # The command that was ran within the job - self.command = command - # Whether or not this job is done; here done doesn't mean if it - # was successful or not, just if it is not running/queued anymore - self.done = False + self.command = None + # The state that this job is in + self.state = self.State.waiting # The exit code of 
the command that was ran (if any) self.exit_code = None - # Whether or not this job was killed; used so what we don't - # bother killing a job multiple times - self.killed = False - # Whether or not the job is currently running - self.running = False # Lock for accessing this object self.lock = threading.Lock() @@ -45,47 +44,29 @@ def getLock(self): """ return self.lock - def set(self, **kwargs): - """ - Thread-safe setter. - """ - with self.getLock(): - for key, value in kwargs.items(): - setattr(self, key, value) - - def getExitCode(self): + def get(self, key): """ - Gets the thread-safe exit code. - - This exit code is what is read by the HPCRunner, - which means that it needs to be locked as we're - also updating it at the same time. + Thread-safe getter for a key """ with self.getLock(): - return self.exit_code + return getattr(self, key) - def getRunning(self): + def getState(self): """ - Gets the thread-safe running state. + Thread-safe getter for the state """ - with self.getLock(): - return self.running + return self.get('state') - def getKilled(self): + def isKilled(self): """ - Gets the thread-safe killed state. + Thread-safe getter for whether or not this was killed """ - with self.getLock(): - return self.killed - - def getDone(self): - """ - Gets the thread-safe done state. - """ - with self.getLock(): - return self.done + return self.getState() == self.State.killed class RunHPC(RunParallel): + # The types for the pools for calling HPC commands + CallHPCPoolType = Enum('CallHPCPoolType', ['submit', 'queue', 'status', 'kill']) + """ Base scheduler for jobs that are ran on HPC. 
""" @@ -106,18 +87,28 @@ def __init__(self, harness, params): # Lock for accessing self.hpc_jobs self.hpc_jobs_lock = threading.Lock() # How often to poll for status updates in getHPCJob() - self.hpc_jobs_update_interval = 10 - # Map of Job -> HPCJob + self.hpc_jobs_update_interval = 5 + # Map of Job ID -> HPCJob self.hpc_jobs = {} # The thread that will update the HPCJobs self.hpc_jobs_updater = None + # The pool of processes for running HPC scheduler commands + # We have a pool so that we don't overwhelm the login node + # with commands, and have a pool for each interaction type + # so that those commands only compete with commands of the + # other type + self.call_hpc_pool = {} + self.call_hpc_pool[self.CallHPCPoolType.submit] = ThreadPool(processes=5) + if not self.options.hpc_no_hold: # only used with holding jobs + self.call_hpc_pool[self.CallHPCPoolType.queue] = ThreadPool(processes=5) + for val in [self.CallHPCPoolType.status, self.CallHPCPoolType.kill]: + self.call_hpc_pool[val] = ThreadPool(processes=1) + # The jump hostname for running commands, if any self.ssh_hosts = self.options.hpc_host # The SSH key to use for connections self.ssh_key_filenames = None - # The pool of processes for running threaded SSH comments - self.ssh_pool = None # The threaded SSHClient objects, mapped by thread identifier # Tuple of (paramiko.SSHClient, str) where str is the hostname self.ssh_clients = None @@ -128,7 +119,7 @@ def __init__(self, harness, params): if self.ssh_hosts: if isinstance(self.ssh_hosts, str): self.ssh_hosts = [self.ssh_hosts] - self.ssh_pool = ThreadPool(processes=5) + self.ssh_clients = {} self.ssh_clients_lock = threading.Lock() @@ -146,8 +137,15 @@ def __init__(self, harness, params): except: pass - # Make sure that we can connect up front - self.callHPC('hostname') + # Make sure that we can call commands up front + for val in self.CallHPCPoolType: + if self.options.hpc_no_hold and val == self.CallHPCPoolType.queue: + continue + self.callHPC(val, 
'hostname') + + # Pool for submitJob(), so that we can submit jobs to be + # held in the background without blocking + self.submit_job_pool = None if self.options.hpc_no_hold else ThreadPool(processes=10) if os.environ.get('APPTAINER_CONTAINER'): if not self.ssh_hosts: @@ -228,7 +226,7 @@ def _callSSH(self, command): """ Calls a SSH command. - Should only be used via apply with the self.ssh_pool. + Should only be used via apply with the self.call_hpc_pool. """ client, host = self._getSSHClient() @@ -252,15 +250,18 @@ def _callSSH(self, command): full_command = f"ssh {host} '{command}'" return exit_code, result.rstrip(), full_command - def callHPC(self, command): + def callHPC(self, pool_type, command): """ Wrapper for calling a HPC command (qsub, qstat, etc) that supports SSH-ing to another host as needed when calling from within apptainer + + Requires the "pool" to specify which command pool to use, of the + RunHPC.CallHPCPoolType types. """ if not self.ssh_hosts: raise Exception('HPC not currently supported outside of a container') - return self.ssh_pool.apply(self._callSSH, (command,)) + return self.call_hpc_pool[pool_type].apply(self._callSSH, (command,)) def getJobSlots(self, job): # Jobs only use one slot because they are ran externally @@ -270,168 +271,208 @@ def availableSlots(self, params): # Support managing 250 HPC jobs concurrently return 250, False - def submitJob(self, job): + def submitJob(self, job, hold): """ Method for submitting an HPC job for the given Job. - Returns the job's ID and the command to be ran in the job. 
- """ - tester = job.getTester() - options = self.options - - submission_script = self.getHPCJobSubmissionPath(job) - output_file = self.getHPCJobOutputPath(job) - - # Clean these two files - for file in [submission_script, output_file]: - if os.path.exists(file): - os.remove(file) - - # Add MOOSE's python path for python scripts - moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) - - # Start building the jinja environment for the submission script - submission_env = {'SCHEDULER_NAME': self.getHPCSchedulerName(), - 'NAME': self.getHPCJobName(job), - 'CWD': tester.getTestDir(), - 'OUTPUT': output_file, - 'SUBMISSION_SCRIPT': submission_script, - 'WALLTIME': str(datetime.timedelta(seconds=tester.getMaxTime())), - 'PROJECT': self.options.hpc_project, - 'TEST_SPEC': tester.getSpecFile(), - 'TEST_NAME': tester.getTestNameShort(), - 'SUBMITTED_HOSTNAME': socket.gethostname(), - 'MOOSE_PYTHONPATH': moose_python, - 'NUM_PROCS': tester.getProcs(options), - 'NUM_THREADS': tester.getThreads(options), - 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), - 'JOB_ID_VARIABLE': self.getHPCJobIDVariable(), - 'PLACE': self.options.hpc_place} - if self.options.hpc_pre_source: - submission_env['SOURCE_FILE'] = options.hpc_pre_source - if self.source_contents: - submission_env['SOURCE_CONTENTS'] = self.source_contents - - # Get the unescaped command - command = tester.getCommand(options) - - # Parse out the mpi command from the command if we're running in apptainer. - # We do this before any of the other escaping - APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') - apptainer_command_prefix = '' - if APPTAINER_CONTAINER: - mpi_command = self.parseMPICommand(command) - if mpi_command: - apptainer_command_prefix = mpi_command - command = command.replace(mpi_command, '') - - # Replace newlines, clean up spaces, and encode the command. 
We encode the - # command here to be able to pass it to a python script to run later without - # dealing with any substitution or evaluation within a shell. Thus, this is - # akin to the SubprocessRunner also running commands. It's a bit complicated, - # but I promise that it's much better than the alternative - command = command.replace('\n', ' ') - command = ' '.join(command.split()) - command_encoded = urllib.parse.quote(command) - - # Script used to decode the command as described above - hpc_run = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_run.py') - - # Special logic for when we're running with apptainer, in which case - # we need to manipulate the command like such - # Original command: - # New command: apptainer exec /path/to/image '' - if APPTAINER_CONTAINER: - job_command = apptainer_command_prefix - - # The root filesystem path that we're in so that we can be sure to bind - # it into the container, if not already set - if self.options.hpc_apptainer_bindpath: - bindpath = self.options.hpc_apptainer_bindpath - else: - bindpath = '/' + os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] - # The apptainer command that will get sandwiched in the middle - apptainer_command = ['apptainer', 'exec', '-B', bindpath] - if self.options.hpc_apptainer_no_home: - apptainer_command.append('--no-home') - apptainer_command.append(APPTAINER_CONTAINER) - apptainer_command = shlex.join(apptainer_command) + The "hold" flag specifies whether or not to submit + the job in a held state. - # Append the apptainer command along with the command to be ran - job_command += f"{apptainer_command} {hpc_run} {command_encoded}" + Returns the resulting HPCJob. 
+ """ + # If we're submitting this Job to be held, but the Job status isn't + # currently held, it means that we've hit job in the submit_job_pool + # that was submitted previously but has already been set to be skipped + # (likely due to a prereq failure) + # NOTE: This _is_ thread safe because StatusSystem is blocking + if hold and not job.isHold(): + return None - # Set that we're using apptainer - submission_env['USING_APPTAINER'] = '1' - # Not in apptainer, so we can just use the escaped command as is - else: - job_command = f'{hpc_run} {command_encoded}' - - submission_env['COMMAND'] = job_command - - # The output files that we're expected to generate so that the - # HPC job can add a terminator for them so that we can verify - # they are complete on the executing host - additional_output = [] - for file in tester.getOutputFiles(options): - additional_output.append(f'"{os.path.join(tester.getTestDir(), file)}"') - submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output) - - # Let the derived scheduler add additional variables - self.augmentJobSubmission(submission_env) - - # Build the script - jinja_env = jinja2.Environment() - definition_template = jinja_env.from_string(self.submission_template) - jinja_env.trim_blocks = True - jinja_env.lstrip_blocks = True - script = definition_template.render(**submission_env) - - # Write the script - open(submission_script, 'w').write(script) - - # Submission command. 
Here we have a simple bash loop - # that will try to wait for the file if it doesn't exist yet - submission_command = self.getHPCSubmissionCommand() - cmd = [f'cd {tester.getTestDir()}', - f'FILE="{submission_script}"', - 'for i in {1..40}', - 'do if [ -e "$FILE" ]', - f'then {self.getHPCSubmissionCommand()} $FILE', - 'exit $?', - 'else sleep 0.25', - 'fi', - 'done', - 'exit 1'] - cmd = '; '.join(cmd) - - # Do the submission; this is thread safe - exit_code, result, full_cmd = self.callHPC(cmd) - - # Set what we've ran for this job so that we can - # potentially get the context in an error - tester.setCommandRan(full_cmd) - - # Nonzero return code - if exit_code != 0: - raise self.CallHPCException(self, f'{submission_command} failed', full_cmd, result) + with self.hpc_jobs_lock: + hpc_job = self.hpc_jobs.get(job.getID()) - # Parse the job ID from the command - job_id = self.parseHPCSubmissionJobID(result) + # Job hasn't been recorded yet; set up with a waiting state + if hpc_job is None: + self.hpc_jobs[job.getID()] = HPCJob(job) + hpc_job = self.hpc_jobs.get(job.getID()) - # Job has been submitted, so set it as queued - # Here we append job_id if the ID is just a number so that it's more - # obvious what it is - job.addCaveats(f'job={job_id}' if job_id.isdigit() else job_id) + with hpc_job.getLock(): + # Job has already been submitted + if hpc_job.state != hpc_job.State.waiting: + return hpc_job - # Setup the job in the status map - with self.hpc_jobs_lock: - if job in self.hpc_jobs: - raise Exception('Job has already been submitted') - hpc_job = HPCJob(job, job_id, job_command) - self.hpc_jobs[job] = hpc_job + tester = job.getTester() + options = self.options + + submission_script = self.getHPCJobSubmissionPath(job) + output_file = self.getHPCJobOutputPath(job) + + # Clean these two files + for file in [submission_script, output_file]: + if os.path.exists(file): + os.remove(file) + + # Add MOOSE's python path for python scripts + moose_python = 
os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) + + # Start building the jinja environment for the submission script + submission_env = {'SCHEDULER_NAME': self.getHPCSchedulerName(), + 'NAME': self.getHPCJobName(job), + 'CWD': tester.getTestDir(), + 'OUTPUT': output_file, + 'SUBMISSION_SCRIPT': submission_script, + 'WALLTIME': str(datetime.timedelta(seconds=tester.getMaxTime())), + 'PROJECT': self.options.hpc_project, + 'TEST_SPEC': tester.getSpecFile(), + 'TEST_NAME': tester.getTestNameShort(), + 'SUBMITTED_HOSTNAME': socket.gethostname(), + 'MOOSE_PYTHONPATH': moose_python, + 'NUM_PROCS': tester.getProcs(options), + 'NUM_THREADS': tester.getThreads(options), + 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), + 'JOB_ID_VARIABLE': self.getHPCJobIDVariable(), + 'PLACE': self.options.hpc_place} + if hold: + submission_env['HOLD'] = 1 + if self.options.hpc_pre_source: + submission_env['SOURCE_FILE'] = options.hpc_pre_source + if self.source_contents: + submission_env['SOURCE_CONTENTS'] = self.source_contents + + # Get the unescaped command + command = tester.getCommand(options) + + # Parse out the mpi command from the command if we're running in apptainer. + # We do this before any of the other escaping + APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') + apptainer_command_prefix = '' + if APPTAINER_CONTAINER: + mpi_command = self.parseMPICommand(command) + if mpi_command: + apptainer_command_prefix = mpi_command + command = command.replace(mpi_command, '') + + # Replace newlines, clean up spaces, and encode the command. We encode the + # command here to be able to pass it to a python script to run later without + # dealing with any substitution or evaluation within a shell. Thus, this is + # akin to the SubprocessRunner also running commands. 
It's a bit complicated, + # but I promise that it's much better than the alternative + command = command.replace('\n', ' ') + command = ' '.join(command.split()) + command_encoded = urllib.parse.quote(command) + + # Script used to decode the command as described above + hpc_run = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_run.py') + + # Special logic for when we're running with apptainer, in which case + # we need to manipulate the command like such + # Original command: + # New command: apptainer exec /path/to/image '' + if APPTAINER_CONTAINER: + job_command = apptainer_command_prefix + + # The root filesystem path that we're in so that we can be sure to bind + # it into the container, if not already set + if self.options.hpc_apptainer_bindpath: + bindpath = self.options.hpc_apptainer_bindpath + else: + bindpath = '/' + os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] + # The apptainer command that will get sandwiched in the middle + apptainer_command = ['apptainer', 'exec', '-B', bindpath] + if self.options.hpc_apptainer_no_home: + apptainer_command.append('--no-home') + apptainer_command.append(APPTAINER_CONTAINER) + apptainer_command = shlex.join(apptainer_command) + + # Append the apptainer command along with the command to be ran + job_command += f"{apptainer_command} {hpc_run} {command_encoded}" + + # Set that we're using apptainer + submission_env['USING_APPTAINER'] = '1' + # Not in apptainer, so we can just use the escaped command as is + else: + job_command = f'{hpc_run} {command_encoded}' + + submission_env['COMMAND'] = job_command + + # The output files that we're expected to generate so that the + # HPC job can add a terminator for them so that we can verify + # they are complete on the executing host + additional_output = [] + for file in tester.getOutputFiles(options): + additional_output.append(f'"{os.path.join(tester.getTestDir(), file)}"') + submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output) + + # 
Let the derived scheduler add additional variables + self.augmentJobSubmission(submission_env) + + # Build the script + jinja_env = jinja2.Environment() + definition_template = jinja_env.from_string(self.submission_template) + jinja_env.trim_blocks = True + jinja_env.lstrip_blocks = True + script = definition_template.render(**submission_env) + + # Write the script + open(submission_script, 'w').write(script) + + # Submission command. Here we have a simple bash loop + # that will try to wait for the file if it doesn't exist yet + submission_command = self.getHPCSubmissionCommand() + cmd = [f'cd {tester.getTestDir()}', + f'FILE="{submission_script}"', + 'for i in {1..40}', + 'do if [ -e "$FILE" ]', + f'then {self.getHPCSubmissionCommand()} $FILE', + 'exit $?', + 'else sleep 0.25', + 'fi', + 'done', + 'exit 1'] + cmd = '; '.join(cmd) + + # Do the submission; this is thread safe + exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.submit, cmd) + + # Set what we've ran for this job so that we can + # potentially get the context in an error + tester.setCommandRan(full_cmd) + + # Nonzero return code + if exit_code != 0: + raise self.CallHPCException(self, f'{submission_command} failed', full_cmd, result) + + # Parse the job ID from the command + job_id = self.parseHPCSubmissionJobID(result) + + # Job has been submitted, so set it as queued + # Here we append job_id if the ID is just a number so that it's more + # obvious what it is + job.addCaveats(f'job={job_id}' if job_id.isdigit() else job_id) + + # Print the job as it's been submitted + job_status = job.hold if hold else job.queued + self.setAndOutputJobStatus(hpc_job.job, job_status, caveats=True) + + hpc_job.id = job_id + hpc_job.command = job_command + hpc_job.state = hpc_job.State.held if hold else hpc_job.State.queued - # Set the job as queued and print out that it is queued - self.setHPCJobQueued(hpc_job) + return hpc_job + + def queueJob(self, job): + """ + Method for queuing a Job to start. 
+ + Should be called from within the HPCRunner to get a job going. + + If the job is not submitted yet, it will submit it in a + non-held state. If the job is submitted but held, it will + release the job. + """ + # See if the job has been submitted yet in the background + with self.hpc_jobs_lock: + hpc_job = self.hpc_jobs.get(job.getID()) # If the updater hasn't been started yet, start it. # We do this here because it's locked within hpc_jobs_lock @@ -441,7 +482,28 @@ def submitJob(self, job): self.hpc_jobs_updater = threading.Thread(target=self._updateHPCJobs) self.hpc_jobs_updater.start() - return hpc_job + # Job has not been submitted yet, so submit it in non-held state + if hpc_job is None: + return self.submitJob(job, False) + + # Job has been submitted but is held, so queue it + with hpc_job.getLock(): + if hpc_job.state == hpc_job.State.held: + if self.options.hpc_no_hold: + raise Exception('Job should not be held with holding disabled') + + cmd = f'{self.getHPCQueueCommand()} {hpc_job.id}' + exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.queue, cmd) + if exit_code != 0: + try: + self.killHPCJob(hpc_job, lock=False) # already locked + except: + pass + raise self.CallHPCException(self, f'{cmd} failed', full_cmd, result) + + self.setHPCJobQueued(hpc_job) + + return hpc_job def augmentJobSubmission(self, submission_env): """ @@ -463,18 +525,31 @@ def _updateHPCJobs(self): try: while True: + # Here we want to store our own list to these objects + # so that we don't hold onto the lock while we loop + # through the jobs with self.hpc_jobs_lock: - active_hpc_jobs = [x for x in self.hpc_jobs.values() if not x.done] - if active_hpc_jobs: - success = self.updateHPCJobs(active_hpc_jobs) - if not success: - if update_jobs_failed: - self.triggerErrorState() - print('ERROR: Failed to get HPC job status') - return - update_jobs_failed = True - else: - update_jobs_failed = False + all_hpc_jobs = [] + for hpc_job in self.hpc_jobs.values(): + 
all_hpc_jobs.append(hpc_job) + + # Get all of the HPC jobs that are currently active + active_hpc_jobs = [] + for hpc_job in all_hpc_jobs: + with hpc_job.getLock(): + if hpc_job.state in [hpc_job.State.queued, hpc_job.State.running]: + active_hpc_jobs.append(hpc_job) + + if active_hpc_jobs: + success = self.updateHPCJobs(active_hpc_jobs) + if not success: + if update_jobs_failed: + self.triggerErrorState() + print('ERROR: Failed to get HPC job status') + return + update_jobs_failed = True + else: + update_jobs_failed = False # Update on the interval requested, but also make sure # that we're still running @@ -503,12 +578,15 @@ def setHPCJobRunning(self, hpc_job): """ Sets the given HPC job as running. - This should be called within the overridden updateHPCJobs(). + Should be called within a lock for the given HPCJob. + + This should be called within the overridden updateHPCJobs() to + set a HPCJob as running. """ # This is currently thread safe because we only ever change # it within updateJobs(), which is only ever executed serially # within the thread the calls _updateHPCJobs() - hpc_job.set(running=True) + hpc_job.state = hpc_job.State.running # Print out that the job is now running self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.running, caveats=True) @@ -516,28 +594,29 @@ def setHPCJobQueued(self, hpc_job): """ Sets the given HPC job as being queued. + Should be called within a lock for the given HPCJob. + This can be used when the HPC scheduler re-schedules the job. This should be called within the overridden updateHPCJobs(). 
""" # Guard against setting this as requeued multiple times - if hpc_job.job.getStatus() == hpc_job.job.queued: + if hpc_job.state == hpc_job.State.queued: return + hpc_job.state = hpc_job.State.queued - # This is currently thread safe because we only ever change - # it within updateJobs(), which is only ever executed serially - # within the thread the calls _updateHPCJobs() - hpc_job.set(running=False) # Print out that the job is queued again self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.queued, caveats=True) - def setHPCJobDone(seflf, hpc_job, exit_code): + def setHPCJobDone(self, hpc_job, exit_code): """ Sets the given HPC job as done. - This should be called within the overridden updateHPCJobs(). + This should be called within the overridden updateHPCJobs(), + within a thread lock for that HPCJob. """ - hpc_job.set(running=False, done=True, exit_code=exit_code) + hpc_job.state = hpc_job.State.done + hpc_job.exit_code = exit_code # We've actually ran something now that didn't fail, so update # the command to what was ran there @@ -552,37 +631,53 @@ def buildRunner(self, job, options): def augmentJobs(self, jobs): super().augmentJobs(jobs) - # If a job has its default time, double it. We grant a little more time - # to small jobs on HPC due to slower IO, etc + # Augment only jobs that are to be ran for job in jobs: - tester = job.getTester() - max_time = tester.getMaxTime() - if max_time == tester.getDefaultMaxTime(): - tester.setMaxTime(max_time * 2) - - def killJob(self, job, lock=True): - """Kills a HPC job""" - with self.hpc_jobs_lock if lock else contextlib.suppress(): - hpc_job = self.hpc_jobs.get(job) - if hpc_job is None or hpc_job.getDone() or hpc_job.getKilled(): + if job.isHold(): + # If a job has its default time, double it. 
We grant a + # little more time to small jobs on HPC due to slower IO, etc + tester = job.getTester() + max_time = tester.getMaxTime() + if max_time == tester.getDefaultMaxTime(): + tester.setMaxTime(max_time * 2) + + # Add the Job to the pool to be submitted as a job in + # a held state. We do this as early as possible so that + # we can get a better priority in the HPC queue. This + # is an asynchronous call so it will happen later when + # available. If the Job actually runs before we have + # a chance to get to this in the pool, when it finally + # executes in the pool, it will do nothing because the + # HPCJob will already exist. + if not self.options.hpc_no_hold: + self.submit_job_pool.apply_async(self.submitJob, (job, True,)) + + def killHPCJob(self, hpc_job, lock=True): + """ + Kills the given HPCJob if it is in a state to be killed. + """ + with hpc_job.getLock() if lock else contextlib.suppress(): + if hpc_job.state in [hpc_job.State.killed, hpc_job.State.done]: return job_id = hpc_job.id - hpc_job.set(killed=True) + hpc_job.state = hpc_job.State.killed # Don't care about whether or not this failed - self.callHPC(f'{self.getHPCCancelCommand()} {job_id}') + self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {job_id}') def killRemaining(self, keyboard=False): """Kills all currently running HPC jobs""" job_ids = [] with self.hpc_jobs_lock: for hpc_job in self.hpc_jobs.values(): - if not hpc_job.getDone() and not hpc_job.getKilled(): + with hpc_job.getLock(): + if hpc_job.state in [hpc_job.State.killed, hpc_job.State.done]: + continue job_ids.append(hpc_job.id) - hpc_job.set(killed=True) + hpc_job.state = hpc_job.State.killed # Don't care about whether or not this failed - self.callHPC(f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') + self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') super().killRemaining(keyboard) @@ -603,7 +698,15 @@ def getHPCSubmissionCommand(self): Should be overridden. 
""" - raise Exception('Unimplemented getHPCSchedulerName()') + raise Exception('Unimplemented getHPCSubmissionCommand()') + + def getHPCQueueCommand(self): + """ + Returns command used for submitting jobs. + + Should be overridden. + """ + raise Exception('Unimplemented getHPCQueueCommand()') def getHPCCancelCommand(self): """ @@ -682,3 +785,10 @@ def setHPCJobError(hpc_job, message, output=None): job.setStatus(job.error, message) if output: job.appendOutput(util.outputHeader(f'Job {hpc_job.id} {output}')) + + def waitFinish(self): + super().waitFinish() + + # Kill everything else that is left, which could be jobs in a held + # state that ended up not running because their dependencies failed + self.killRemaining() diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 5b9780767daf..f112e85bbd4c 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -24,7 +24,7 @@ def augmentJobSubmission(self, submission_env): def updateHPCJobs(self, active_hpc_jobs): # Poll for all of the jobs within a single call cmd = ['qstat', '-xf', '-F', 'json'] + [x.id for x in active_hpc_jobs] - exit_code, result, _ = self.callHPC(' '.join(cmd)) + exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) if exit_code != 0: return False @@ -33,8 +33,6 @@ def updateHPCJobs(self, active_hpc_jobs): job_results = json_result['Jobs'] for hpc_job in active_hpc_jobs: - job = hpc_job.job - # This job's result from the qstat command job_result = job_results[hpc_job.id] exit_code = job_result.get('Exit_status') @@ -42,46 +40,49 @@ def updateHPCJobs(self, active_hpc_jobs): exit_code = int(exit_code) state = job_result.get('job_state') - # The job has switched to running - if state == 'R' and not hpc_job.getRunning(): - self.setHPCJobRunning(hpc_job) - - # The job is held, so we're going to consider it a failure and - # will also try to cancel it so that it doesn't hang around - if state 
== 'H': - self.setHPCJobError(hpc_job, 'PBS JOB HELD', 'was held; killed job') - exit_code = 1 - try: - self.killJob(job, lock=False) # no lock; we're already in one - except: - pass - - # Job finished before it started, so something killed it - if state == 'F' and exit_code is None: - self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed') - exit_code = 1 - - # If we were running but now we're done, we're not running anymore - if exit_code is not None: - if exit_code < 0: - name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) - # Job timed out; give this a special timeout status because - # it is then marked as recoverable (could try running again) - if name == 'JOB_EXEC_KILL_WALLTIME': - job.setStatus(job.timeout, 'PBS TIMEOUT') - # Special status where the job failed to start due to a PBS - # issue and will be started again, so there's nothing to do - elif name in ['JOB_EXEC_HOOK_RERUN', 'JOB_EXEC_RETRY']: - self.setHPCJobQueued(hpc_job) - continue - # Everything else should be an error - else: - self.setHPCJobError(hpc_job, f'PBS ERROR: {name}', f'was terminated with reason: {reason}') - # Job was killed with a signal - elif exit_code >= 128: - self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') - - self.setHPCJobDone(hpc_job, exit_code) + with hpc_job.getLock(): + job = hpc_job.job + + # The job has switched to running + if state == 'R' and hpc_job.state != hpc_job.State.running: + self.setHPCJobRunning(hpc_job) + + # The job is held, so we're going to consider it a failure and + # will also try to cancel it so that it doesn't hang around + if state == 'H' and (job_result.get('Hold_Types') != 'u' or self.options.hpc_no_hold): + self.setHPCJobError(hpc_job, 'PBS JOB HELD', 'was held; killed job') + exit_code = 1 + try: + self.killHPCJob(hpc_job, lock=False) # no lock; we're already in one + except: + pass + + # Job finished before it started, so something killed it + if state == 'F' and exit_code is None: 
+ self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed') + exit_code = 1 + + # If we were running but now we're done, we're not running anymore + if exit_code is not None: + if exit_code < 0: + name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) + # Job timed out; give this a special timeout status because + # it is then marked as recoverable (could try running again) + if name == 'JOB_EXEC_KILL_WALLTIME': + job.setStatus(job.timeout, 'PBS JOB TIMEOUT') + # Special status where the job failed to start due to a PBS + # issue and will be started again, so there's nothing to do + elif name in ['JOB_EXEC_HOOK_RERUN', 'JOB_EXEC_RETRY']: + self.setHPCJobQueued(hpc_job) + continue + # Everything else should be an error + else: + self.setHPCJobError(hpc_job, f'PBS ERROR: {name}', f'was terminated with reason: {reason}') + # Job was killed with a signal + elif exit_code >= 128: + self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') + + self.setHPCJobDone(hpc_job, exit_code) # Success return True @@ -92,6 +93,9 @@ def getHPCSchedulerName(self): def getHPCSubmissionCommand(self): return 'qsub' + def getHPCQueueCommand(self): + return 'qrls' + def getHPCCancelCommand(self): return 'qdel' diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index fd063b462bd0..3e826f5ee4ee 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -26,7 +26,7 @@ def updateHPCJobs(self, active_hpc_jobs): active_job_ids = ','.join([x.id for x in active_hpc_jobs]) cmd = ['sacct', '-j', active_job_ids, '--parsable2', '--noheader', '-o', 'jobid,exitcode,state,reason'] - exit_code, result, _ = self.callHPC(' '.join(cmd)) + exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) if exit_code != 0: return False @@ -55,35 +55,31 @@ def updateHPCJobs(self, active_hpc_jobs): # The slurm job state; see 
slurm.schedmd.com/squeue.html#lbAG state = status['state'] - # Job wasn't running and it's no longer pending, so it - # is running or has at least ran - if state != 'PENDING' and not hpc_job.getRunning(): - self.setHPCJobRunning(hpc_job) + with hpc_job.getLock(): + # Job wasn't running and it's no longer pending, so it + # is running or has at least ran + if state != 'PENDING' and hpc_job.state != hpc_job.State.running: + self.setHPCJobRunning(hpc_job) - # Fail if the job is held - if state in ['JobHeldUser']: - self.setHPCJobError(hpc_job, f'SLURM STATE {state}', f'has state "{state}"') - self.setHPCJobDone(hpc_job, 1) + # Job was running and isn't running anymore, so it's done + if hpc_job.state == hpc_job.State.running and state not in ['RUNNING', 'COMPLETING']: + exit_code = int(status['exitcode']) + if state == 'FAILED' and exit_code == 0: + raise Exception(f'Job {hpc_job.id} has unexpected exit code {exit_code} with FAILED state') - # Job was running and isn't running anymore, so it's done - if hpc_job.getRunning() and state not in ['RUNNING', 'COMPLETING']: - exit_code = int(status['exitcode']) - if state == 'FAILED' and exit_code == 0: - raise Exception(f'Job {hpc_job.id} has unexpected exit code {exit_code} with FAILED state') + job = hpc_job.job - job = hpc_job.job + # Job has timed out; setting a timeout status means that this + # state is recoverable + if state == 'TIMEOUT': + job.setStatus(job.timeout, 'SLURM JOB TIMEOUT') + # If a job COMPLETED, it's done with exit code 0 so everything + # went well. If it FAILED, it finished but returned with a + # non-zero exit code, which will be handled by the Tester. 
+ elif state not in ['FAILED', 'COMPLETED']: + self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'has state "{state}"') - # Job has timed out; setting a timeout status means that this - # state is recoverable - if state == 'TIMEOUT': - job.setStatus(job.timeout, 'TIMEOUT') - # If a job COMPLETED, it's done with exit code 0 so everything - # went well. If it FAILED, it finished but returned with a - # non-zero exit code, which will be handled by the Tester. - elif state not in ['FAILED', 'COMPLETED']: - self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'has state "{state}"') - - self.setHPCJobDone(hpc_job, exit_code) + self.setHPCJobDone(hpc_job, exit_code) # Success return True @@ -94,6 +90,9 @@ def getHPCSchedulerName(self): def getHPCSubmissionCommand(self): return 'sbatch' + def getHPCQueueCommand(self): + return 'scontrol release' + def getHPCCancelCommand(self): return 'scancel' diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 0bf800bb34c7..dafdd5401c23 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -190,8 +190,21 @@ def __sortAndLaunch(self): Sort by largest DAG and launch """ sorted_jobs = sorted(self.__dag_bank, key=lambda x: len(x[1].topological_sort()), reverse=True) - for jobs, _ in sorted_jobs: - self.queueJobs(jobs) + for job_dag, _ in sorted_jobs: + # Allow derived schedulers access to the jobs before they launch + # We purposely call this one here so that we augment the Jobs + # in the order that they're launched + self.augmentJobs(job_dag.getJobs()) + # And launch + self.queueJobs(job_dag) + + def outputJobStatus(self, job, caveats=None): + """ + Forces a Job's status to be output asap + """ + with job.getLock(): + job.force_report_status = True + self.handleJobStatus(job, caveats=caveats) def setAndOutputJobStatus(self, job, status, caveats=None): """ @@ -199,8 +212,7 @@ def setAndOutputJobStatus(self, job, status, 
caveats=None): """ with job.getLock(): job.setStatus(status) - job.force_report_status = True - self.handleJobStatus(job, caveats=caveats) + self.outputJobStatus(job, caveats=caveats) def isRunning(self): """ @@ -271,9 +283,6 @@ def schedule(self, testers): jobs = JobDAG(self.options, parallel_scheduling) j_dag = jobs.createJobs(testers) - # Allow derived schedulers access to the jobs before they launch - self.augmentJobs(jobs.getJobs()) - # job-count to tester-count sanity check if j_dag.size() != len(testers): raise SchedulerError('Scheduler was going to run a different amount of testers than what was received (something bad happened)!') @@ -287,7 +296,7 @@ def schedule(self, testers): # Store all scheduled jobs self.__scheduled_jobs.append(j_dag.topological_sort()) - def queueJobs(self, jobs): + def queueJobs(self, job_dag): """ Determine which queue jobs should enter. Finished jobs are placed in the status pool to be printed while all others are placed in the runner pool to perform work. @@ -296,8 +305,8 @@ def queueJobs(self, jobs): jobs to become available and ready to enter the runner pool (dependency jobs). 
""" state = self.getStatusPoolState() - with jobs.getLock(): - concurrent_jobs = jobs.getJobsAndAdvance() + with job_dag.getLock(): + concurrent_jobs = job_dag.getJobsAndAdvance() for job in concurrent_jobs: if job.isFinished(): if not state: @@ -306,7 +315,7 @@ def queueJobs(self, jobs): elif job.isHold(): if not state: job.setStatus(job.queued) - self.run_pool.apply_async(self.runJob, (job, jobs,)) + self.run_pool.apply_async(self.runJob, (job, job_dag,)) def getLoad(self): """ Method to return current load average """ diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index de243e8a42d9..daa44e5e8454 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -4,6 +4,9 @@ #PBS -l select={{ NUM_PROCS }}:mpiprocs=1:ncpus={{ NUM_THREADS }} #PBS -l walltime={{ WALLTIME }} #PBS -P {{ PROJECT }} +{%- if HOLD is defined %} +#PBS -h +{%- endif %} {%- if QUEUE is defined %} #PBS -q {{ QUEUE }} {%- endif %} @@ -17,6 +20,9 @@ #SBATCH --time={{ WALLTIME }} #SBATCH --wckey={{ PROJECT }} #SBATCH --output={{ OUTPUT }} +{%- if HOLD is defined %} +#SBATCH --hold +{%- endif %} {%- if PLACE == "scatter" %} #SBATCH --ntasks-per-node=1 {%- endif %} From 2fd7e899ef021e860859332a85bd1a35e1f802f0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 12:21:13 -0600 Subject: [PATCH 168/243] Only kill jobs if there are jobs to kill --- python/TestHarness/schedulers/RunHPC.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index a92c43a43884..95ae5c0f2b00 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -677,7 +677,8 @@ def killRemaining(self, keyboard=False): hpc_job.state = hpc_job.State.killed # Don't care about whether or not this failed - self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {" 
".join(job_ids)}') + if job_ids: + self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') super().killRemaining(keyboard) From b4fed3a79f64b813ce638d00d1e06c6613f01e32 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 12:21:25 -0600 Subject: [PATCH 169/243] Remove unused method --- python/TestHarness/schedulers/Scheduler.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index dafdd5401c23..dd6fbdbaa533 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -198,21 +198,14 @@ def __sortAndLaunch(self): # And launch self.queueJobs(job_dag) - def outputJobStatus(self, job, caveats=None): - """ - Forces a Job's status to be output asap - """ - with job.getLock(): - job.force_report_status = True - self.handleJobStatus(job, caveats=caveats) - def setAndOutputJobStatus(self, job, status, caveats=None): """ Sets a Job's status and forces the status to be output asap """ + job.setStatus(status) with job.getLock(): - job.setStatus(status) - self.outputJobStatus(job, caveats=caveats) + job.force_report_status = True + self.handleJobStatus(job, caveats=caveats) def isRunning(self): """ From e1422ba86019799d7e2ee7e4af5cadcf08173754 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 12:31:15 -0600 Subject: [PATCH 170/243] Correctly kill the remaining jobs --- python/TestHarness/schedulers/RunHPC.py | 28 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 95ae5c0f2b00..b5560db77861 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -665,21 +665,27 @@ def killHPCJob(self, hpc_job, lock=True): # Don't care about whether or not this failed 
self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {job_id}') - def killRemaining(self, keyboard=False): - """Kills all currently running HPC jobs""" + def killHPCJobs(self, functor): + """ + Kills the HPC jobs the meet the criteria of the functor. + + The functor should take a single object, the HPCJob, and + should return a bool stating whether or not to kill that job. + """ job_ids = [] with self.hpc_jobs_lock: for hpc_job in self.hpc_jobs.values(): with hpc_job.getLock(): - if hpc_job.state in [hpc_job.State.killed, hpc_job.State.done]: - continue - job_ids.append(hpc_job.id) - hpc_job.state = hpc_job.State.killed + if functor(hpc_job): + job_ids.append(hpc_job.id) - # Don't care about whether or not this failed if job_ids: self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') + def killRemaining(self, keyboard=False): + """Kills all currently running HPC jobs""" + functor = lambda hpc_job: hpc_job.state not in [hpc_job.State.killed, hpc_job.State.done] + self.killHPCJobs(functor) super().killRemaining(keyboard) def getHPCSchedulerName(self): @@ -790,6 +796,8 @@ def setHPCJobError(hpc_job, message, output=None): def waitFinish(self): super().waitFinish() - # Kill everything else that is left, which could be jobs in a held - # state that ended up not running because their dependencies failed - self.killRemaining() + # Kill the remaining jobs that are held, which would exist if things + # fail and jobs that we pre-submitted were skipped due to a failed + # dependency above them + functor = lambda hpc_job: hpc_job.state == hpc_job.State.held + self.killHPCJobs(functor) From ef52947706f3c2e42d9d2395ce4fc1d0addcc820 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 13:24:28 -0600 Subject: [PATCH 171/243] Process the job updates in chunks --- python/TestHarness/schedulers/RunHPC.py | 55 ++++++++++++++--------- python/TestHarness/schedulers/RunPBS.py | 6 +-- 
python/TestHarness/schedulers/RunSlurm.py | 6 +-- 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index b5560db77861..fec46f711cf4 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -526,30 +526,41 @@ def _updateHPCJobs(self): try: while True: # Here we want to store our own list to these objects - # so that we don't hold onto the lock while we loop - # through the jobs + # so that we don't hold onto the lock while we work + # on each job individually with self.hpc_jobs_lock: - all_hpc_jobs = [] - for hpc_job in self.hpc_jobs.values(): - all_hpc_jobs.append(hpc_job) + hpc_jobs = [x for x in self.hpc_jobs.values()] # Get all of the HPC jobs that are currently active - active_hpc_jobs = [] - for hpc_job in all_hpc_jobs: - with hpc_job.getLock(): - if hpc_job.state in [hpc_job.State.queued, hpc_job.State.running]: - active_hpc_jobs.append(hpc_job) - - if active_hpc_jobs: - success = self.updateHPCJobs(active_hpc_jobs) - if not success: - if update_jobs_failed: - self.triggerErrorState() - print('ERROR: Failed to get HPC job status') - return - update_jobs_failed = True - else: - update_jobs_failed = False + active_states = [HPCJob.State.queued, HPCJob.State.running] + active_hpc_jobs = [x for x in hpc_jobs if x.getState() in active_states] + + # Helper for splitting a list into chunks. We won't update + # everything together because PBS is particularly bad + # at processing the status for a ton of jobs at once... 
+ def in_chunks(l, N): + for i in range(0, len(l), N): + yield l[i:i + N] + + # Whether or not all of the updates suceeded + success = True + + # Process 50 jobs at a time (thanks PBS) + for chunked_hpc_jobs in in_chunks(active_hpc_jobs, 50): + # Returns whether or not it failed + if not self.updateHPCJobs(chunked_hpc_jobs): + success = False + + # At least one of the updates failed; allow this to + # happen only once + if not success: + if update_jobs_failed: + self.triggerErrorState() + print('ERROR: Failed to get HPC job status') + return + update_jobs_failed = True + else: + update_jobs_failed = False # Update on the interval requested, but also make sure # that we're still running @@ -562,7 +573,7 @@ def _updateHPCJobs(self): self.triggerErrorState() raise - def updateHPCJobs(self, active_hpc_jobs): + def updateHPCJobs(self, hpc_jobs): """ Updates the underlying jobs. diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f112e85bbd4c..88c3fecd8b28 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -21,9 +21,9 @@ def augmentJobSubmission(self, submission_env): if self.options.hpc_queue: submission_env['QUEUE'] = self.options.hpc_queue - def updateHPCJobs(self, active_hpc_jobs): + def updateHPCJobs(self, hpc_jobs): # Poll for all of the jobs within a single call - cmd = ['qstat', '-xf', '-F', 'json'] + [x.id for x in active_hpc_jobs] + cmd = ['qstat', '-xf', '-F', 'json'] + [x.id for x in hpc_jobs] exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) if exit_code != 0: return False @@ -32,7 +32,7 @@ def updateHPCJobs(self, active_hpc_jobs): json_result = json.loads(result) job_results = json_result['Jobs'] - for hpc_job in active_hpc_jobs: + for hpc_job in hpc_jobs: # This job's result from the qstat command job_result = job_results[hpc_job.id] exit_code = job_result.get('Exit_status') diff --git a/python/TestHarness/schedulers/RunSlurm.py 
b/python/TestHarness/schedulers/RunSlurm.py index 3e826f5ee4ee..0ee530b975b0 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -21,9 +21,9 @@ def __init__(self, harness, params): # Slurm is quite a bit faster at updating self.hpc_jobs_update_interval = 5 - def updateHPCJobs(self, active_hpc_jobs): + def updateHPCJobs(self, hpc_jobs): # Poll for all of the jobs within a single call - active_job_ids = ','.join([x.id for x in active_hpc_jobs]) + active_job_ids = ','.join([x.id for x in hpc_jobs]) cmd = ['sacct', '-j', active_job_ids, '--parsable2', '--noheader', '-o', 'jobid,exitcode,state,reason'] exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) @@ -46,7 +46,7 @@ def updateHPCJobs(self, active_hpc_jobs): 'reason': status_split[3]} # Update the jobs that we can - for hpc_job in active_hpc_jobs: + for hpc_job in hpc_jobs: # Slurm jobs are sometimes not immediately available status = statuses.get(hpc_job.id) if status is None: From 8f29568d9c1e020954d5a74ee4301f442afcae25 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 13:27:19 -0600 Subject: [PATCH 172/243] Make the chunk size a config option and increase it for slurm --- python/TestHarness/schedulers/RunHPC.py | 9 ++++++--- python/TestHarness/schedulers/RunSlurm.py | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index fec46f711cf4..8f7d96e2b38c 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -86,8 +86,11 @@ def __init__(self, harness, params): # Lock for accessing self.hpc_jobs self.hpc_jobs_lock = threading.Lock() - # How often to poll for status updates in getHPCJob() + # How often to poll (in sec) for status updates in getHPCJob() self.hpc_jobs_update_interval = 5 + # How many HPC jobs to update at a time in updateHPCJobs() + # This needs to be an 
option because PBS is awful + self.update_hpc_jobs_chunk_size = 50 # Map of Job ID -> HPCJob self.hpc_jobs = {} # The thread that will update the HPCJobs @@ -545,8 +548,8 @@ def in_chunks(l, N): # Whether or not all of the updates suceeded success = True - # Process 50 jobs at a time (thanks PBS) - for chunked_hpc_jobs in in_chunks(active_hpc_jobs, 50): + # Process a subset of jobs at a time + for chunked_hpc_jobs in in_chunks(active_hpc_jobs, self.update_hpc_jobs_chunk_size): # Returns whether or not it failed if not self.updateHPCJobs(chunked_hpc_jobs): success = False diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 0ee530b975b0..6e17ec0da8cd 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -18,8 +18,9 @@ class RunSlurm(RunHPC): def __init__(self, harness, params): super().__init__(harness, params) - # Slurm is quite a bit faster at updating - self.hpc_jobs_update_interval = 5 + # Slurm is significantly better at job status, so we can + # update all at the same time + self.update_hpc_jobs_chunk_size = 1000 def updateHPCJobs(self, hpc_jobs): # Poll for all of the jobs within a single call From dd599615e44381bfe1fc6c199d50ae510fcfdeb9 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 14:35:42 -0600 Subject: [PATCH 173/243] Simplify chunk size call --- python/TestHarness/schedulers/RunHPC.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 8f7d96e2b38c..06424c24bc34 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -541,7 +541,8 @@ def _updateHPCJobs(self): # Helper for splitting a list into chunks. We won't update # everything together because PBS is particularly bad # at processing the status for a ton of jobs at once... 
- def in_chunks(l, N): + def in_chunks(l): + N = self.update_hpc_jobs_chunk_size for i in range(0, len(l), N): yield l[i:i + N] @@ -549,7 +550,7 @@ def in_chunks(l, N): success = True # Process a subset of jobs at a time - for chunked_hpc_jobs in in_chunks(active_hpc_jobs, self.update_hpc_jobs_chunk_size): + for chunked_hpc_jobs in in_chunks(active_hpc_jobs): # Returns whether or not it failed if not self.updateHPCJobs(chunked_hpc_jobs): success = False From 982e0943ea920d962585435a77ff64adcd7983a3 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 19 Jun 2024 14:43:03 -0600 Subject: [PATCH 174/243] Don't use thread locking for the Tester's status --- python/TestHarness/StatusSystem.py | 31 ++++++++++++++++++++++------ python/TestHarness/schedulers/Job.py | 2 +- python/TestHarness/testers/Tester.py | 7 +++++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/python/TestHarness/StatusSystem.py b/python/TestHarness/StatusSystem.py index 62f40b0385e5..6ea71e73b989 100644 --- a/python/TestHarness/StatusSystem.py +++ b/python/TestHarness/StatusSystem.py @@ -9,6 +9,7 @@ from collections import namedtuple import threading +import contextlib def initStatus(): status = namedtuple('status', 'status color code sort_value') @@ -21,6 +22,8 @@ class StatusSystem(object): """ A Class for supplying statuses, with status text color and corresponding exit codes. + Set locking=True within the initializer to enable thread-safe access. + Syntax: status = StatusSystem() @@ -104,11 +107,23 @@ class StatusSystem(object): queued, running] - def __init__(self): + def __init__(self, locking=False): # The underlying status self.__status = self.no_status - # The lock for reading/changing the status - self.__lock = threading.Lock() + # The lock for reading/changing the status, if any + if locking: + self.__lock = threading.Lock() + else: + self.__lock = None + + def getLock(self): + """ + Gets the thread lock for this system, if any. 
+ + This is safe to use in a with statement even if locking + is not enabled. + """ + return self.__lock if self.__lock else contextlib.suppress() def createStatus(self, status_key='NA'): """ return a specific status object based on supplied status name """ @@ -118,9 +133,11 @@ def createStatus(self, status_key='NA'): def getStatus(self): """ - Return the thread-safe status object. + Return the status object. + + This is thread-safe if initialized with locking=True. """ - with self.__lock: + with self.getLock(): return self.__status @staticmethod @@ -148,8 +165,10 @@ def setStatus(self, status=no_status): Set the current status to status. If status is not supplied, 'no_status' is implied. There is a validation check during this process to ensure the named tuple adheres to this class's set statuses. + + This is thread-safe if initialized with locking=True. """ - with self.__lock: + with self.getLock(): if self.isValid(status): self.__status = status else: diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index e1fb54209d46..5afaa419a676 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -81,7 +81,7 @@ def __init__(self, tester, job_dag, options): self.__job_message = '' ### Enumerate the job statuses we want to use - self.job_status = StatusSystem() + self.job_status = StatusSystem(locking=True) self.hold = self.job_status.hold self.queued = self.job_status.queued diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index daa239779470..f86559886503 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -142,8 +142,11 @@ def __init__(self, name, params): if self.specs["allow_test_objects"]: self.specs["cli_args"].append("--allow-test-objects") - ### Enumerate the tester statuses we want to use - self.test_status = StatusSystem() + # The Tester status; here we do not use locks because we need to + # do deep 
copy operations of a Tester object, and thread locks + # cannot be deep copied. + self.test_status = StatusSystem(locking=False) + # Enumerate the tester statuses we want to use self.no_status = self.test_status.no_status self.queued = self.test_status.queued self.skip = self.test_status.skip From 29cba71caa2d8cf0ec4f00733068c3498780cd7c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 20 Jun 2024 07:58:34 -0600 Subject: [PATCH 175/243] Set mvapich options for threading --- python/TestHarness/schedulers/RunHPC.py | 4 ++-- python/TestHarness/schedulers/hpc_template | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 06424c24bc34..6f890926ff7a 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -330,8 +330,8 @@ def submitJob(self, job, hold): 'TEST_NAME': tester.getTestNameShort(), 'SUBMITTED_HOSTNAME': socket.gethostname(), 'MOOSE_PYTHONPATH': moose_python, - 'NUM_PROCS': tester.getProcs(options), - 'NUM_THREADS': tester.getThreads(options), + 'NUM_PROCS': int(tester.getProcs(options)), + 'NUM_THREADS': int(tester.getThreads(options)), 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), 'JOB_ID_VARIABLE': self.getHPCJobIDVariable(), 'PLACE': self.options.hpc_place} diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index daa44e5e8454..76cee048d3ef 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -36,6 +36,12 @@ set -e {{ SOURCE_CONTENTS }} {%- endif %} +{%- if NUM_THREADS != 1 %} +# Set mvapich options for threading +export MV2_USE_ALIGNED_ALLOC=1 +export MV2_THREADS_PER_PROCESS={{ NUM_THREADS }} +{%- endif %} + # Add MOOSE's python path for python scripts export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} From cbcfb577066c106ec29dc447818ff7eca28e16ca Mon Sep 17 
00:00:00 2001 From: Logan Harbour Date: Mon, 8 Jul 2024 09:36:56 -0600 Subject: [PATCH 176/243] Use -W force to kill --- python/TestHarness/schedulers/RunPBS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 88c3fecd8b28..e12564f0998b 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -97,7 +97,7 @@ def getHPCQueueCommand(self): return 'qrls' def getHPCCancelCommand(self): - return 'qdel' + return 'qdel -W force' def getHPCJobIDVariable(self): return 'PBS_JOBID' From 188f58cbdf867810d5c542c7b4001951a102d78f Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Mon, 8 Jul 2024 10:24:14 -0600 Subject: [PATCH 177/243] Use options.hpc and not options.pbs --- python/TestHarness/testers/RunApp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 301712ab8b3f..7a21c97d336c 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -119,7 +119,7 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False - if options.pbs and self.specs.isValid('command_proxy') and os.environ.get('APPTAINER_CONTAINER') is not None: + if options.hpc and self.specs.isValid('command_proxy') and os.environ.get('APPTAINER_CONTAINER') is not None: self.addCaveats('hpc unsupported') self.setStatus(self.skip) return False From c14e16e96c697aa7da00f83ab860c1dc2ecd26c1 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 20 Aug 2024 13:07:02 -0600 Subject: [PATCH 178/243] Transition to using openmpi --- python/TestHarness/schedulers/hpc_source | 3 +-- python/TestHarness/schedulers/hpc_template | 6 ------ python/TestHarness/testers/RunApp.py | 3 ++- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/TestHarness/schedulers/hpc_source b/python/TestHarness/schedulers/hpc_source index 
36244fdca169..abb65f042821 100644 --- a/python/TestHarness/schedulers/hpc_source +++ b/python/TestHarness/schedulers/hpc_source @@ -1,3 +1,2 @@ export MOOSE_DEV_CONTAINER_MINIMAL_BINDPATH=1 -module load use.moose moose-dev-container -export MV2_ENABLE_AFFINITY=0 +module load use.moose moose-dev-container-openmpi diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index 76cee048d3ef..daa44e5e8454 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -36,12 +36,6 @@ set -e {{ SOURCE_CONTENTS }} {%- endif %} -{%- if NUM_THREADS != 1 %} -# Set mvapich options for threading -export MV2_USE_ALIGNED_ALLOC=1 -export MV2_THREADS_PER_PROCESS={{ NUM_THREADS }} -{%- endif %} - # Add MOOSE's python path for python scripts export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 7a21c97d336c..0baf63f90c7a 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -254,7 +254,8 @@ def getCommand(self, options): elif nthreads > 1: command = command + ' --n-threads=' + str(nthreads) - if self.force_mpi or ncpus > 1: + # Force mpi, more than 1 core, or openmpi (openmpi requires mpiexec even in serial) + if self.force_mpi or ncpus > 1 or self.hasOpenMPI(): command = f'{self.mpi_command} -n {ncpus} {command}' # Arbitrary proxy command, but keep track of the command so that someone could use it later From b889c8f398b2fa0e6ecb4240aa4cc0a52febbf11 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 20 Aug 2024 16:07:34 -0600 Subject: [PATCH 179/243] Don't run python unit tests on hpc --- python/TestHarness/testers/PythonUnitTest.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 82492de131fb..4145d52a140d 100644 --- 
a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -49,10 +49,11 @@ def getCommand(self, options): return cmd + ' '.join(self.specs['cli_args']) def checkRunnable(self, options): - # Can't run within apptainer in parallel because mpiexec needs to be - # executed outside of the apptainer call - if os.environ.get('APPTAINER_CONTAINER') and self.getProcs(options) > 1: - self.addCaveats('PARALLEL APPTAINER') + # Don't run unit tests on HPC. These tests commonly involve running + # an appliacation within a black box script, which we cannot control + # very well within the HPC environment + if options.hpc: + self.addCaveats('hpc unsupported') self.setStatus(self.skip) return False From 8c9808fa570c75d64af85423f0bce6b12376382e Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 20 Aug 2024 16:07:47 -0600 Subject: [PATCH 180/243] Attempt to capture deterministic null output from openmpi on hpc --- python/TestHarness/runners/HPCRunner.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 37608f3b384e..6977070e1f6c 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -130,6 +130,22 @@ def print_files(files, type): waited_time += self.file_completion_poll_time time.sleep(self.file_completion_poll_time) + # Handle openmpi appending a null character at the end of jobs + # that return a nonzero exit code. We don't know how to fix this + # in openmpi yet, so this is the cleanest way to take care of it. + # We're looking for \n\0##########', which is at the end of the + # apptainer execution within hpc_template. This allows the null + # character check that happens in Runner.finalize() to still + # be valid. 
+ if self.exit_code != 0 and self.job.getTester().hasOpenMPI() and self.output: + prefix = '\n' + null = '\0' + suffix = '##########' + all = f'{prefix}{null}{suffix}' + no_null = f'{prefix}{suffix}' + if all in self.output: + self.output = self.output.replace(all, no_null) + def kill(self): if self.hpc_job: self.run_hpc.killHPCJob(self.hpc_job) From da4dbce6dfee42c538b490d89fa3bcacacc49d10 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 20 Aug 2024 16:08:01 -0600 Subject: [PATCH 181/243] Forcefully run mpiexec even with 1 core on openmpi hpc --- python/TestHarness/testers/RunApp.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 0baf63f90c7a..fd071a3d92ec 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -197,7 +197,13 @@ def getCommand(self, options): # all other TestHarness supplied options. if specs['no_additional_cli_args']: # TODO: Do error checking for TestHarness options that will be silently ignored - return os.path.join(specs['test_dir'], specs['executable']) + ' ' + ' '.join(specs['cli_args']) + cmd = os.path.join(specs['test_dir'], specs['executable']) + ' ' + ' '.join(specs['cli_args']) + + # Need to run mpiexec with containerized openmpi + if options.hpc and self.hasOpenMPI(): + cmd = f'mpiexec -n 1 {cmd}' + + return cmd # Create the additional command line arguments list cli_args = list(specs['cli_args']) @@ -254,8 +260,8 @@ def getCommand(self, options): elif nthreads > 1: command = command + ' --n-threads=' + str(nthreads) - # Force mpi, more than 1 core, or openmpi (openmpi requires mpiexec even in serial) - if self.force_mpi or ncpus > 1 or self.hasOpenMPI(): + # Force mpi, more than 1 core, or containerized openmpi (requires mpiexec serial) + if self.force_mpi or ncpus > 1 or (options.hpc and self.hasOpenMPI()): command = f'{self.mpi_command} -n {ncpus} {command}' # Arbitrary proxy 
command, but keep track of the command so that someone could use it later From 6116b25e216fa64d2f85255a04b4bb1c8ee92240 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 20 Aug 2024 16:08:55 -0600 Subject: [PATCH 182/243] Remove redundant test This is not a valid requirement for moose. The requirements should be software requirements, not environment requirements --- test/tests/misc/mpi_setup/tests | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 test/tests/misc/mpi_setup/tests diff --git a/test/tests/misc/mpi_setup/tests b/test/tests/misc/mpi_setup/tests deleted file mode 100644 index c5ce954d2521..000000000000 --- a/test/tests/misc/mpi_setup/tests +++ /dev/null @@ -1,9 +0,0 @@ -[Tests] - [basic_mpirun_works] - type = RunCommand - command = 'mpirun -n 2 hostname' - requirement = "The system's test suite shall verify that the parallel environment is at least minimally working." - issues = '#22635' - design = 'MooseUtils.md' - [] -[] From c0c7e7ea72ba374ba53624f56b36e459e454e845 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 20 Aug 2024 19:45:47 -0600 Subject: [PATCH 183/243] Correct for openmpi null terminator --- python/TestHarness/runners/SubprocessRunner.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index 4d8639b0e4be..ace8e03607db 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -79,11 +79,24 @@ def wait(self, timer): self.exit_code = self.process.poll() + # Header for the combined output self.output = util.outputHeader('Begin combined stdout+stderr output') + + # Load combined output for file in [self.outfile, self.errfile]: file.flush() self.output += self.readOutput(file) file.close() + + # For some reason openmpi will append a null character at the end + # when the exit code is nonzero. Not sure why this is... 
but remove + # it until we figure out what's broken + if file == self.errfile and self.exit_code != 0 \ + and self.job.getTester().hasOpenMPI() and len(self.output) > 2 \ + and self.output[-3:] == '\n\0\n': + self.output = self.output[0:-2] + + # Footer for the combined output self.output += util.outputHeader('End combined stderr+stdout output') def kill(self): From c9bf42fc450e9c4caff5d910751233ade65c8cbc Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 21 Aug 2024 12:25:47 -0600 Subject: [PATCH 184/243] Rework Job timing and add times to PBS jobs --- python/TestHarness/TestHarness.py | 5 + python/TestHarness/runners/HPCRunner.py | 6 +- .../TestHarness/runners/SubprocessRunner.py | 4 +- python/TestHarness/schedulers/Job.py | 229 +++++++++++------- python/TestHarness/schedulers/RunHPC.py | 47 +++- python/TestHarness/schedulers/RunPBS.py | 48 +++- python/TestHarness/schedulers/Scheduler.py | 4 + 7 files changed, 239 insertions(+), 104 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 7a2d1eb94208..97eae3d634ba 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -704,6 +704,11 @@ def cleanup(self): timing_avg = 0 print(('Ran %d tests in %.1f seconds. Average test time %.1f seconds, maximum test time %.1f seconds.' 
% (self.num_passed+self.num_failed, time, timing_avg, timing_max))) + # Get additional results from the scheduler + scheduler_summary = self.scheduler.additionalResultSummary() + if scheduler_summary: + print(scheduler_summary) + if self.num_passed: summary = '%d passed' else: diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 6977070e1f6c..05cf0af4dd3e 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -37,8 +37,6 @@ def spawn(self, timer): # Rely on the RunHPC object to queue the job self.hpc_job = self.run_hpc.queueJob(self.job) - timer.start() - def wait(self, timer): # The states that we should wait on. Anything else should # be an invalid state for waiting @@ -56,8 +54,6 @@ def wait(self, timer): self.exit_code = self.hpc_job.exit_code break - timer.stop() - # The PBS output (stdout+stderr) output_file = self.run_hpc.getHPCJobOutputPath(self.job) @@ -91,6 +87,7 @@ def wait(self, timer): incomplete_files = set() # Wait for all of the files to be available + timer.start('hpc_wait_output') waited_time = 0 while wait_files or incomplete_files: # Don't bother if we've been killed @@ -129,6 +126,7 @@ def print_files(files, type): waited_time += self.file_completion_poll_time time.sleep(self.file_completion_poll_time) + timer.stop('hpc_wait_output') # Handle openmpi appending a null character at the end of jobs # that return a nonzero exit code. 
We don't know how to fix this diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index ace8e03607db..716705a367d5 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -70,12 +70,12 @@ def spawn(self, timer): except Exception as e: raise Exception('Error in launching a new task') from e - timer.start() + timer.start('runner_run') def wait(self, timer): self.process.wait() - timer.stop() + timer.stop('runner_run') self.exit_code = self.process.poll() diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 5afaa419a676..e4a7b3e29208 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -14,34 +14,101 @@ from TestHarness.runners.Runner import Runner from TestHarness import util from tempfile import TemporaryDirectory +from collections import namedtuple import traceback +def time_now(): + return time.time_ns() / (10 ** 9) + class Timer(object): """ A helper class for testers to track the time it takes to run. - - Every call to the start method must be followed by a call to stop. 
""" def __init__(self): - self.starts = [] - self.ends = [] - def start(self): - """ starts the timer clock """ - self.starts.append(clock()) - def stop(self): - """ stop/pauses the timer clock """ - self.ends.append(clock()) - def cumulativeDur(self): - """ returns the total/cumulative time taken by the timer """ - diffs = [end - start for start, end in zip(self.starts, self.ends)] - return sum(diffs) - def averageDur(self): - return self.cumulativeDur() / len(self.starts) - def nRuns(self): - return len(self.starts) - def reset(self): - self.starts = [] - self.ends = [] + # Dict of time name -> (start,) or (start,end) + self.times = {} + # Threading lock for setting timers + self.lock = threading.Lock() + + @staticmethod + def time_now() -> float: + """ Helper for getting a precise now time """ + return float(time.time_ns() / (10 ** 9)) + + def start(self, name: str, at_time=None): + """ Start the given timer """ + if not at_time: + at_time = self.time_now() + with self.lock: + self.times[name] = [at_time] + + def stop(self, name: str, at_time=None): + """ End the given timer """ + if not at_time: + at_time = self.time_now() + with self.lock: + entry = self.times.get(name) + if not entry: + raise Exception(f'Missing time entry {name}') + + if len(entry) > 1: + raise Exception(f'Time entry {name} already stopped') + entry.append(at_time) + + def startMain(self): + """ Get the start time for the main timer """ + self.start('main') + + def stopMain(self): + """ Get the end time for the main timer """ + self.stop('main') + + def hasTime(self, name: str): + """ Whether or not the given timer exists """ + with self.lock: + return name in self.times + + def hasTotalTime(self, name: str): + """ Whether or not the given total time exists """ + with self.lock: + entry = self.times.get(name) + if not entry: + return False + return len(entry) > 1 + + def totalTime(self, name='main'): + """ Get the total time for the given timer """ + with self.lock: + entry = 
self.times.get(name) + if not entry: + if name == 'main': + return 0 + raise Exception(f'Missing time entry {name}') + + if len(entry) > 1: + return entry[1] - entry[0] + return time_now() - entry[0] + + def totalTimes(self): + """ Get the total times """ + times = {} + for name, entry in self.times.items(): + times[name] = self.totalTime(name) + return times + + class TimeManager: + """ Context manager for timing a section """ + def __init__(self, timer, name: str): + self.timer = timer + self.name = name + def __enter__(self): + self.timer.start(self.name) + def __exit__(self, exc_type, exc_val, exc_tb): + self.timer.stop(self.name) + + def time(self, name: str): + """ Time a section using a context manager """ + return self.TimeManager(self, name) class Job(object): """ @@ -59,8 +126,6 @@ def __init__(self, tester, job_dag, options): self.specs = tester.specs self.__job_dag = job_dag self.timer = Timer() - self.__start_time = clock() - self.__end_time = None self.__previous_time = None self.__joined_out = '' self.report_timer = None @@ -279,87 +344,102 @@ def run(self): """ tester = self.__tester - # Helper for trying and catching - def try_catch(do, exception_name): - try: - do() - except Exception: + # Start the main timer for running + self.timer.startMain() + + # Helper for exiting + def cleanup(): + with self.timer.time('job_cleanup'): self.cleanup() - self.setStatus(self.error, f'{exception_name} EXCEPTION') - self.output += util.outputHeader('Python exception encountered') - self.output += traceback.format_exc() - return False - return True + self.timer.stopMain() + + # Helper for trying and catching + def try_catch(do, exception_name, timer_name): + with self.timer.time(timer_name): + failed = False + try: + do() + except: + trace = traceback.format_exc() + self.setStatus(self.error, f'{exception_name} EXCEPTION') + self.output += util.outputHeader('Python exception encountered') + self.output += trace + failed = True + + if failed: + cleanup() + return 
not failed # Do not execute app, but still run the tester # This is truly awful and I really hate that it got put in here, # please remove it if you can. if not tester.shouldExecute(): run_tester = lambda: tester.run(self.options, 0, '') - try_catch(run_tester, 'TESTER RUN') + try_catch(run_tester, 'TESTER RUN', 'tester_run') return if self.options.pedantic_checks and self.canParallel(): # Before the job does anything, get the times files below it were last modified - self.fileChecker.get_all_files(self, self.fileChecker.getOriginalTimes()) - self.addCaveats('pedantic check') - time.sleep(1) + with self.timer.time('pedantic_init'): + self.fileChecker.get_all_files(self, self.fileChecker.getOriginalTimes()) + self.addCaveats('pedantic check') + time.sleep(1) - tester.prepare(self.options) + with self.timer.time('tester_prepare'): + tester.prepare(self.options) # Verify that the working directory is available right before we execute if not os.path.exists(tester.getTestDir()): self.setStatus(self.error, 'WORKING DIRECTORY NOT FOUND') + cleanup() return # Getting the command can also cause a failure, so try that tester.getCommand(self.options) if tester.isError(): + cleanup() return - self.timer.reset() - - self.__start_time = clock() - # Spawn the process spawn = lambda: self._runner.spawn(self.timer) - if not try_catch(spawn, 'RUNNER SPAWN'): + if not try_catch(spawn, 'RUNNER SPAWN', 'runner_spawn'): return # Entry point for testers to do other things post_spawn = lambda: tester.postSpawn(self._runner) - if not try_catch(post_spawn, 'TESTER POST SPAWN'): + if not try_catch(post_spawn, 'TESTER POST SPAWN', 'tester_post_spawn'): return # And wait for it to complete wait = lambda: self._runner.wait(self.timer) - if not try_catch(wait, 'RUNNER WAIT'): + if not try_catch(wait, 'RUNNER WAIT', 'runner_wait'): return - self.__start_time = self.timer.starts[0] - self.__end_time = self.timer.ends[-1] - # Job error occurred, which means the Runner didn't complete # so don't 
process anything else if self.isError(): - self.cleanup() + cleanup() return # And do finalize (really just cleans up output) - self._runner.finalize() + runner_finalize = lambda: self._runner.finalize() + if not try_catch(runner_finalize, 'RUNNER FINALIZE', 'runner_finalize'): + return + # Check if the files we checked on earlier were modified. if self.options.pedantic_checks and self.canParallel(): - # Check if the files we checked on earlier were modified. - self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) - self.modifiedFiles = self.fileChecker.check_changes(self.fileChecker.getOriginalTimes(), self.fileChecker.getNewTimes()) + with self.timer.time('pedantic_check'): + self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) + self.modifiedFiles = self.fileChecker.check_changes(self.fileChecker.getOriginalTimes(), + self.fileChecker.getNewTimes()) # Allow derived proccessResults to process the output and set a failing status (if it failed) runner_output = self._runner.getOutput() exit_code = self._runner.getExitCode() run_tester = lambda: tester.run(self.options, exit_code, runner_output) - try_catch(run_tester, 'TESTER RUN') + try_catch(run_tester, 'TESTER RUN', 'tester_run') # Run cleanup now that we're done - self.cleanup() + cleanup() def killProcess(self): """ Kill remaining process that may be running """ @@ -370,14 +450,6 @@ def killProcess(self): pass self.cleanup() - def getStartTime(self): - """ Return the time the process started """ - return self.__start_time - - def getEndTime(self): - """ Return the time the process exited """ - return self.__end_time - def getOutput(self): """ Return the combined contents of output """ # Cached output is used when reading from a results file, @@ -421,18 +493,6 @@ def getOutputFile(self): def appendOutput(self, output): self.output += output - def getActiveTime(self): - """ Return active time """ - m = re.search(r"Active time=(\S+)", self.getOutput()) - if m != None: - return 
float(m.group(1)) - - def getSolveTime(self): - """ Return solve time """ - m = re.search(r"solve().*", self.getOutput()) - if m != None: - return m.group().split()[5] - def setPreviousTime(self, t): """ Allow an arbitrary time to be set. This is used by the QueueManager @@ -442,17 +502,16 @@ def setPreviousTime(self, t): def getTiming(self): """ Return active time if available, if not return a comparison of start and end time """ - if self.getActiveTime(): - return self.getActiveTime() - elif self.getEndTime() and self.getStartTime(): - return self.timer.cumulativeDur() - elif self.getStartTime() and self.isRunning(): - # If the test is still running, return current run time instead - return max(0.0, clock() - self.getStartTime()) - elif self.__previous_time: + # Actual execution time + if self.timer.hasTime('runner_run'): + return self.timer.totalTime('runner_run') + # Job has started + if self.timer.hasTime('main'): + return self.timer.totalTime() + # Previous time is set + if self.__previous_time: return self.__previous_time - else: - return 0.0 + return 0.0 def getStatus(self): return self.job_status.getStatus() diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 6f890926ff7a..b9667b5536dd 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -14,6 +14,7 @@ import paramiko import jinja2 import copy +import statistics from multiprocessing.pool import ThreadPool from TestHarness import util @@ -437,6 +438,10 @@ def submitJob(self, job, hold): # Do the submission; this is thread safe exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.submit, cmd) + # Start the queued timer if needed + if not hold: + job.timer.start('hpc_queued') + # Set what we've ran for this job so that we can # potentially get the context in an error tester.setCommandRan(full_cmd) @@ -504,6 +509,9 @@ def queueJob(self, job): pass raise self.CallHPCException(self, f'{cmd} failed', 
full_cmd, result) + # Start the timer now that we've queued it + hpc_job.job.timer.start('hpc_queued') + self.setHPCJobQueued(hpc_job) return hpc_job @@ -589,7 +597,7 @@ def updateHPCJobs(self, hpc_jobs): """ raise Exception('Unimplemented updateHPCJobs()') - def setHPCJobRunning(self, hpc_job): + def setHPCJobRunning(self, hpc_job, start_time): """ Sets the given HPC job as running. @@ -598,10 +606,19 @@ def setHPCJobRunning(self, hpc_job): This should be called within the overridden updateHPCJobs() to set a HPCJob as running. """ + job = hpc_job.job + # This is currently thread safe because we only ever change # it within updateJobs(), which is only ever executed serially # within the thread the calls _updateHPCJobs() hpc_job.state = hpc_job.State.running + + # The job is no longer queued as of when it started + if job.timer.hasTime('hpc_queued'): + job.timer.stop('hpc_queued', start_time) + # The runner job (actual walltime for the exec) as of when it started + job.timer.start('runner_run', start_time) + # Print out that the job is now running self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.running, caveats=True) @@ -623,19 +640,24 @@ def setHPCJobQueued(self, hpc_job): # Print out that the job is queued again self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.queued, caveats=True) - def setHPCJobDone(self, hpc_job, exit_code): + def setHPCJobDone(self, hpc_job, exit_code, end_time): """ Sets the given HPC job as done. This should be called within the overridden updateHPCJobs(), within a thread lock for that HPCJob. 
""" + job = hpc_job.job + hpc_job.state = hpc_job.State.done hpc_job.exit_code = exit_code + # The runner job (actual walltime for the exec) ends when it stopped + if job.timer.hasTime('runner_run'): + job.timer.stop('runner_run', end_time) + # We've actually ran something now that didn't fail, so update # the command to what was ran there - job = hpc_job.job if not job.isError(): job.getTester().setCommandRan(hpc_job.command) @@ -816,3 +838,22 @@ def waitFinish(self): # dependency above them functor = lambda hpc_job: hpc_job.state == hpc_job.State.held self.killHPCJobs(functor) + + def additionalResultSummary(self): + timer_keys = ['hpc_queued', 'hpc_wait_output'] + times = {} + for key in timer_keys: + times[key] = [] + + for hpc_job in self.hpc_jobs.values(): + timer = hpc_job.job.timer + for key in timer_keys: + if timer.hasTotalTime(key): + times[key].append(timer.totalTime(key)) + + avg_queued = statistics.mean(times['hpc_queued']) + avg_wait_output = statistics.mean(times['hpc_wait_output']) + + result = f'Average queued time {avg_queued:.1f} seconds, ' + result += f'average output wait time {avg_wait_output:.1f} seconds.' 
+ return result diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index e12564f0998b..f222b220bcf7 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -8,6 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import re, json +from datetime import datetime from RunHPC import RunHPC from PBScodes import PBS_User_EXITCODES from TestHarness import util @@ -36,22 +37,39 @@ def updateHPCJobs(self, hpc_jobs): # This job's result from the qstat command job_result = job_results[hpc_job.id] exit_code = job_result.get('Exit_status') - if exit_code is not None: - exit_code = int(exit_code) state = job_result.get('job_state') + obittime = job_result.get('obittime') with hpc_job.getLock(): job = hpc_job.job - # The job has switched to running - if state == 'R' and hpc_job.state != hpc_job.State.running: - self.setHPCJobRunning(hpc_job) + # Helper for parsing timings + def parse_time(name, graceful=False): + time_format = '%a %b %d %H:%M:%S %Y' + entry = job_result.get(name) + if not entry: + if not graceful: + self.setHPCJobError(hpc_job, 'FAILED TO GET TIMING', + f'Failed to get time entry "{name}"') + return None + + try: + return datetime.strptime(entry, '%a %b %d %H:%M:%S %Y').timestamp() + except: + self.setHPCJobError(hpc_job, 'FAILED TO PARSE TIMING', + f'Failed to parse time "{time}" from entry "{name}"') + return None + + # Job is queued and it has switched to running + if hpc_job.state == hpc_job.State.queued: + start_time = parse_time('stime', True) + if start_time: + self.setHPCJobRunning(hpc_job, start_time) # The job is held, so we're going to consider it a failure and # will also try to cancel it so that it doesn't hang around if state == 'H' and (job_result.get('Hold_Types') != 'u' or self.options.hpc_no_hold): self.setHPCJobError(hpc_job, 'PBS JOB HELD', 'was held; killed job') - exit_code = 1 try: self.killHPCJob(hpc_job, lock=False) # no lock; we're already in one 
except: @@ -60,10 +78,14 @@ def updateHPCJobs(self, hpc_jobs): # Job finished before it started, so something killed it if state == 'F' and exit_code is None: self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed') - exit_code = 1 - # If we were running but now we're done, we're not running anymore - if exit_code is not None: + # If we have a finished time or an error, we're done + if obittime or job.getStatus() == job.error: + if exit_code is not None: + exit_code = int(exit_code) + else: + exit_code = 1 + if exit_code < 0: name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) # Job timed out; give this a special timeout status because @@ -82,7 +104,13 @@ def updateHPCJobs(self, hpc_jobs): elif exit_code >= 128: self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') - self.setHPCJobDone(hpc_job, exit_code) + # Parse end time if possible + end_time = None + if obittime: + end_time = parse_time('obittime') + + # Parse the queued time + self.setHPCJobDone(hpc_job, exit_code, end_time) # Success return True diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index dd6fbdbaa533..f3b6fe460d97 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -548,3 +548,7 @@ def runJob(self, job, jobs): except KeyboardInterrupt: self.killRemaining(keyboard=True) + + def additionalResultSummary(self): + """ Entrypoint to add additional results to the final summary """ + return None From b2457928ee6621fce977d0d6640ac508b6cf932c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 21 Aug 2024 13:17:41 -0600 Subject: [PATCH 185/243] Account for wrongly negative queue times on slurm --- python/TestHarness/schedulers/Job.py | 8 ++++++++ python/TestHarness/schedulers/RunHPC.py | 12 +++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py 
index e4a7b3e29208..17d547855af5 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -96,6 +96,14 @@ def totalTimes(self): times[name] = self.totalTime(name) return times + def startTime(self, name): + """ Get the start time """ + with self.lock: + entry = self.times.get(name) + if not entry: + raise Exception(f'Missing time entry {name}') + return entry[0] + class TimeManager: """ Context manager for timing a section """ def __init__(self, timer, name: str): diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index b9667b5536dd..381ffd144a66 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -607,6 +607,7 @@ def setHPCJobRunning(self, hpc_job, start_time): set a HPCJob as running. """ job = hpc_job.job + timer = job.timer # This is currently thread safe because we only ever change # it within updateJobs(), which is only ever executed serially @@ -614,10 +615,15 @@ def setHPCJobRunning(self, hpc_job, start_time): hpc_job.state = hpc_job.State.running # The job is no longer queued as of when it started - if job.timer.hasTime('hpc_queued'): - job.timer.stop('hpc_queued', start_time) + if timer.hasTime('hpc_queued'): + queued_start_time = timer.startTime('hpc_queued') + # This can happen on slurm in < 1s, which could give us negatives + if start_time < queued_start_time: + timer.stop('hpc_queued', queued_start_time) + else: + timer.stop('hpc_queued', start_time) # The runner job (actual walltime for the exec) as of when it started - job.timer.start('runner_run', start_time) + timer.start('runner_run', start_time) # Print out that the job is now running self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.running, caveats=True) From 7ffa280a0d10a7ee9d9bb450d4e170639721b1c3 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 21 Aug 2024 13:19:50 -0600 Subject: [PATCH 186/243] Use walltime for PBS timing instead --- 
python/TestHarness/schedulers/RunPBS.py | 47 +++++++++++++++++-------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index f222b220bcf7..20e5e8d70674 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -37,6 +37,8 @@ def updateHPCJobs(self, hpc_jobs): # This job's result from the qstat command job_result = job_results[hpc_job.id] exit_code = job_result.get('Exit_status') + if exit_code is not None: + exit_code = int(exit_code) state = job_result.get('job_state') obittime = job_result.get('obittime') @@ -44,13 +46,10 @@ def updateHPCJobs(self, hpc_jobs): job = hpc_job.job # Helper for parsing timings - def parse_time(name, graceful=False): + def parse_time(name): time_format = '%a %b %d %H:%M:%S %Y' entry = job_result.get(name) if not entry: - if not graceful: - self.setHPCJobError(hpc_job, 'FAILED TO GET TIMING', - f'Failed to get time entry "{name}"') return None try: @@ -62,7 +61,7 @@ def parse_time(name, graceful=False): # Job is queued and it has switched to running if hpc_job.state == hpc_job.State.queued: - start_time = parse_time('stime', True) + start_time = parse_time('stime') if start_time: self.setHPCJobRunning(hpc_job, start_time) @@ -70,6 +69,7 @@ def parse_time(name, graceful=False): # will also try to cancel it so that it doesn't hang around if state == 'H' and (job_result.get('Hold_Types') != 'u' or self.options.hpc_no_hold): self.setHPCJobError(hpc_job, 'PBS JOB HELD', 'was held; killed job') + exit_code = 1 try: self.killHPCJob(hpc_job, lock=False) # no lock; we're already in one except: @@ -78,14 +78,10 @@ def parse_time(name, graceful=False): # Job finished before it started, so something killed it if state == 'F' and exit_code is None: self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed') + exit_code = 1 # If we have a finished time or an error, we're done - if obittime or job.getStatus() == 
job.error: - if exit_code is not None: - exit_code = int(exit_code) - else: - exit_code = 1 - + if exit_code is not None: if exit_code < 0: name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) # Job timed out; give this a special timeout status because @@ -104,12 +100,33 @@ def parse_time(name, graceful=False): elif exit_code >= 128: self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') - # Parse end time if possible + # Parse walltime if possible + walltime_sec = None + resources_used = job_result.get('resources_used') + if resources_used: + walltime = resources_used.get('walltime') + if walltime: + search = re.search(f'^(\d{2}):(\d{2}):(\d{2})$', walltime) + if search: + walltime_sec = datetime.timedelta(hours=int(search.group(1)), + minutes=int(search.group(2)), + seconds=int(search.group(3))).total_seconds() + else: + self.setHPCJobError(hpc_job, 'WALLTIME PARSE ERROR', + f'Failed to parse walltime from {walltime}') + + + # Set end time if possible. PBS has an 'obittime' entry that can be used, + # but it includes a ton of cleanup time that isn't representative of the + # actual time. Here, we'll cheat and just append to the start time the + # walltime to determine the end time, if possible. If not possible, it'll + # just use the time for now which is fine. 
end_time = None - if obittime: - end_time = parse_time('obittime') + if walltime_sec: + start_time = parse_time('stime') + if start_time: + end_time = start_time + walltime_sec - # Parse the queued time self.setHPCJobDone(hpc_job, exit_code, end_time) # Success From 82ae60cc17738a9f8e17fd04ff68f6b5a8863ea0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 21 Aug 2024 13:20:18 -0600 Subject: [PATCH 187/243] Parse walltimes for slurm --- python/TestHarness/schedulers/RunPBS.py | 8 ++++---- python/TestHarness/schedulers/RunSlurm.py | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 20e5e8d70674..82b82709edb9 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import re, json -from datetime import datetime +import datetime from RunHPC import RunHPC from PBScodes import PBS_User_EXITCODES from TestHarness import util @@ -53,7 +53,7 @@ def parse_time(name): return None try: - return datetime.strptime(entry, '%a %b %d %H:%M:%S %Y').timestamp() + return datetime.datetime.strptime(entry, '%a %b %d %H:%M:%S %Y').timestamp() except: self.setHPCJobError(hpc_job, 'FAILED TO PARSE TIMING', f'Failed to parse time "{time}" from entry "{name}"') @@ -106,14 +106,14 @@ def parse_time(name): if resources_used: walltime = resources_used.get('walltime') if walltime: - search = re.search(f'^(\d{2}):(\d{2}):(\d{2})$', walltime) + search = re.search(r'^(\d+):(\d{2}):(\d{2})$', walltime) if search: walltime_sec = datetime.timedelta(hours=int(search.group(1)), minutes=int(search.group(2)), seconds=int(search.group(3))).total_seconds() else: self.setHPCJobError(hpc_job, 'WALLTIME PARSE ERROR', - f'Failed to parse walltime from {walltime}') + f'Failed to parse walltime from "{walltime}"') # Set end time if possible. 
PBS has an 'obittime' entry that can be used, diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py index 6e17ec0da8cd..4acd0e65c80c 100644 --- a/python/TestHarness/schedulers/RunSlurm.py +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -8,6 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import re +from datetime import datetime from RunHPC import RunHPC ## This Class is responsible for maintaining an interface to the slurm scheduling syntax @@ -26,7 +27,7 @@ def updateHPCJobs(self, hpc_jobs): # Poll for all of the jobs within a single call active_job_ids = ','.join([x.id for x in hpc_jobs]) cmd = ['sacct', '-j', active_job_ids, '--parsable2', '--noheader', - '-o', 'jobid,exitcode,state,reason'] + '-o', 'jobid,exitcode,state,reason,start,end'] exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) if exit_code != 0: return False @@ -44,10 +45,18 @@ def updateHPCJobs(self, hpc_jobs): # exit code of the process, the second is a slurm internal code statuses[id] = {'exitcode': int(status_split[1].split(':')[0]), 'state': status_split[2], - 'reason': status_split[3]} + 'reason': status_split[3], + 'start': status_split[4], + 'end': status_split[5]} # Update the jobs that we can for hpc_job in hpc_jobs: + # Helper for parsing a time + def parse_time(time): + if time: + return datetime.strptime(time, '%Y-%m-%dT%H:%M:%S').timestamp() + return None + # Slurm jobs are sometimes not immediately available status = statuses.get(hpc_job.id) if status is None: @@ -60,7 +69,8 @@ def updateHPCJobs(self, hpc_jobs): # Job wasn't running and it's no longer pending, so it # is running or has at least ran if state != 'PENDING' and hpc_job.state != hpc_job.State.running: - self.setHPCJobRunning(hpc_job) + start_time = parse_time(status['start']) + self.setHPCJobRunning(hpc_job, start_time) # Job was running and isn't running anymore, so it's done if hpc_job.state == hpc_job.State.running and state not in 
['RUNNING', 'COMPLETING']: @@ -80,7 +90,8 @@ def updateHPCJobs(self, hpc_jobs): elif state not in ['FAILED', 'COMPLETED']: self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'has state "{state}"') - self.setHPCJobDone(hpc_job, exit_code) + end_time = parse_time(status['end']) + self.setHPCJobDone(hpc_job, exit_code, end_time) # Success return True From ac69bf3f3935a08cec2e9ae409accb9d260f7eca Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 21 Aug 2024 13:34:20 -0600 Subject: [PATCH 188/243] Correct for no entries --- python/TestHarness/schedulers/RunHPC.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 381ffd144a66..345b292c8edc 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -857,9 +857,10 @@ def additionalResultSummary(self): if timer.hasTotalTime(key): times[key].append(timer.totalTime(key)) - avg_queued = statistics.mean(times['hpc_queued']) - avg_wait_output = statistics.mean(times['hpc_wait_output']) + averages = {} + for key, values in times.items(): + averages[key] = statistics.mean(values) if values else 0 - result = f'Average queued time {avg_queued:.1f} seconds, ' - result += f'average output wait time {avg_wait_output:.1f} seconds.' + result = f'Average queued time {averages["hpc_queued"]:.1f} seconds, ' + result += f'average output wait time {averages["hpc_wait_output"]:.1f} seconds.' 
return result From 494fcab18159d983fd04ff3a01f464de6a167b2a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 22 Aug 2024 11:06:30 -0600 Subject: [PATCH 189/243] Determine walltime and exit code internally --- .gitignore | 2 ++ python/TestHarness/runners/HPCRunner.py | 30 ++++++++++++++++++++-- python/TestHarness/schedulers/Job.py | 8 +++++- python/TestHarness/schedulers/RunHPC.py | 8 +++++- python/TestHarness/schedulers/hpc_template | 23 +++++++++++++---- 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index daa5c91444c3..6d76766de75d 100644 --- a/.gitignore +++ b/.gitignore @@ -330,5 +330,7 @@ share/ # test harness hpc output pbs_*.qsub pbs_*.out +pbs_*.result slurm_*.sbatch slurm_*.out +slurm_*.result diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 05cf0af4dd3e..38f34fb99ae7 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import re, time, os, subprocess +import re, time, os, subprocess, yaml from TestHarness.runners.Runner import Runner from TestHarness import util @@ -56,6 +56,8 @@ def wait(self, timer): # The PBS output (stdout+stderr) output_file = self.run_hpc.getHPCJobOutputPath(self.job) + # The result file (exit code + walltime) + result_file = self.run_hpc.getHPCJobResultPath(self.job) # If the Job is already finished, something happened in the # HPC scheduler so we have an invalid state for processing @@ -78,7 +80,7 @@ def wait(self, timer): tester = self.job.getTester() # Determine the output files that we need to wait for to be complete - wait_files = set([output_file]) + wait_files = set([output_file, result_file]) # Output files needed by the Tester, only if it says we should if tester.mustOutputExist(self.exit_code): for file in tester.getOutputFiles(self.options): @@ -89,6 +91,7 @@ 
def wait(self, timer): # Wait for all of the files to be available timer.start('hpc_wait_output') waited_time = 0 + walltime = None while wait_files or incomplete_files: # Don't bother if we've been killed if self.hpc_job.isKilled(): @@ -108,6 +111,19 @@ def wait(self, timer): # It's now required because its complete if not self.trySetOutput(required=True): break + # Store the result + elif file == result_file: + with open(file, 'r') as f: + result = yaml.safe_load(f) + self.exit_code = result['exit_code'] + walltime = result['walltime'] + + # Delete this, we don't really need it to hang around + try: + os.remove(file) + except OSError: + pass + # Done with this file incomplete_files.discard(file) @@ -128,6 +144,16 @@ def print_files(files, type): time.sleep(self.file_completion_poll_time) timer.stop('hpc_wait_output') + # If we have a walltime from output, use it instead as it'll be + # more accurate for the real runtime + if walltime: + timer = self.job.timer + start_time = timer.startTime('runner_run') + end_time = start_time + walltime + timer.reset('runner_run') + timer.start('runner_run', start_time) + timer.stop('runner_run', end_time) + # Handle openmpi appending a null character at the end of jobs # that return a nonzero exit code. We don't know how to fix this # in openmpi yet, so this is the cleanest way to take care of it. 
diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 17d547855af5..07cc41a772f0 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -50,7 +50,6 @@ def stop(self, name: str, at_time=None): entry = self.times.get(name) if not entry: raise Exception(f'Missing time entry {name}') - if len(entry) > 1: raise Exception(f'Time entry {name} already stopped') entry.append(at_time) @@ -104,6 +103,13 @@ def startTime(self, name): raise Exception(f'Missing time entry {name}') return entry[0] + def reset(self, name: str): + """ Resets a given timer """ + with self.lock: + if name not in self.times: + raise Exception(f'Missing time entry {name}') + del self.times[name] + class TimeManager: """ Context manager for timing a section """ def __init__(self, timer, name: str): diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 345b292c8edc..7d5201d96276 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -310,6 +310,7 @@ def submitJob(self, job, hold): submission_script = self.getHPCJobSubmissionPath(job) output_file = self.getHPCJobOutputPath(job) + result_file = self.getHPCJobResultPath(job) # Clean these two files for file in [submission_script, output_file]: @@ -324,6 +325,7 @@ def submitJob(self, job, hold): 'NAME': self.getHPCJobName(job), 'CWD': tester.getTestDir(), 'OUTPUT': output_file, + 'RESULT': result_file, 'SUBMISSION_SCRIPT': submission_script, 'WALLTIME': str(datetime.timedelta(seconds=tester.getMaxTime())), 'PROJECT': self.options.hpc_project, @@ -402,7 +404,7 @@ def submitJob(self, job, hold): # The output files that we're expected to generate so that the # HPC job can add a terminator for them so that we can verify # they are complete on the executing host - additional_output = [] + additional_output = [result_file] for file in tester.getOutputFiles(options): 
additional_output.append(f'"{os.path.join(tester.getTestDir(), file)}"') submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output) @@ -801,6 +803,10 @@ def getHPCJobOutputPath(self, job): """Gets the absolute path for stdout/stderr for a HPC job""" return self.getHPCJobOutputPathPrefix(job) + '.out' + def getHPCJobResultPath(self, job): + """Gets the absolute path for the result (exit code, walltime) for a HPC job""" + return self.getHPCJobOutputPathPrefix(job) + '.result' + def getHPCJobSubmissionPath(self, job): """Gets the aboslute path for the qsub script for a HPC job""" return self.getHPCJobOutputPathPrefix(job) + f'.{self.getHPCSubmissionCommand()}' diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index daa44e5e8454..8b8879e6b58d 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -47,6 +47,7 @@ echo 'Directory: {{ CWD }}' echo 'Submitted hostname: {{ SUBMITTED_HOSTNAME }}' echo 'Submission script: {{ SUBMISSION_SCRIPT }}' echo 'Output: {{ OUTPUT }}' +echo 'Result: {{ RESULT }}' module list echo "################################################################################" @@ -55,14 +56,16 @@ echo "Beginning TestHarness {{ SCHEDULER_NAME }} test execution" # Move into the test directory cd {{ CWD }} +# Make a temp file to store the time +time_output=$(mktemp) + # Don't exit on failure: need to capture the actual run's return code set +e - -# Run the command -{{ COMMAND }} +# Run the command, wrapped in time so that we can capture the real runtime +# We use which here to make sure we don't get the bash function 'time' +$(which time) -f %e -o ${time_output} {{ COMMAND }} # ...and capture the return code cause we're not done yet return_code=$? 
- # Exit on failure set -e @@ -80,7 +83,17 @@ if ((return_code > 128)); then return_code=$new_return_code fi {%- endif %} -echo "Completed TestHarness {{ SCHEDULER_NAME }} test execution; exit code = $return_code" +# Load the execution time; we use a tail here because the process will +# include a comment about a non-zero status first if the exit code is nonzero +walltime=$(tail -1 ${time_output}) +rm ${time_output} +# Print the exit footer +echo "Completed TestHarness {{ SCHEDULER_NAME }} test execution; exit code = $return_code, walltime = $walltime sec" + +# Build the result file +touch {{ RESULT }} +echo "exit_code: $return_code" >> {{ RESULT }} +echo "walltime: $walltime" >> {{ RESULT }} # Append a terminator to all of the output files for file syncing across NFS ADDITIONAL_OUTPUT_FILES=({{ ADDITIONAL_OUTPUT_FILES }}) From b8a154e4ee190ae53a6d61bb203d306c307e6c43 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 22 Aug 2024 13:43:03 -0600 Subject: [PATCH 190/243] Also remove the result file --- python/TestHarness/schedulers/RunHPC.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 7d5201d96276..33629ed0fc17 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -312,8 +312,8 @@ def submitJob(self, job, hold): output_file = self.getHPCJobOutputPath(job) result_file = self.getHPCJobResultPath(job) - # Clean these two files - for file in [submission_script, output_file]: + # Remove these files if they exist + for file in [submission_script, output_file, result_file]: if os.path.exists(file): os.remove(file) From 15e8f60c95027025c4fdbcc4ae317633c44efcec Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 22 Aug 2024 13:43:15 -0600 Subject: [PATCH 191/243] Try even harder to get a walltime from PBS --- python/TestHarness/schedulers/RunPBS.py | 30 ++++++++++++------------- 1 file changed, 15 
insertions(+), 15 deletions(-) diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 82b82709edb9..ea7de8abb0d8 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -100,10 +100,15 @@ def parse_time(name): elif exit_code >= 128: self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') - # Parse walltime if possible - walltime_sec = None + # Parse end time if possible. PBS is all over the place on this one. Sometimes + # walltime is available, sometimes it isn't. We also have obittime, but that + # time seems to be longer than the actual run. + end_time = None + # First try to get it from the walltime (sometimes this is 0...). We'll fake + # this a bit and just add the walltime to the start time + stime = parse_time('stime') resources_used = job_result.get('resources_used') - if resources_used: + if stime and resources_used: walltime = resources_used.get('walltime') if walltime: search = re.search(r'^(\d+):(\d{2}):(\d{2})$', walltime) @@ -111,21 +116,16 @@ def parse_time(name): walltime_sec = datetime.timedelta(hours=int(search.group(1)), minutes=int(search.group(2)), seconds=int(search.group(3))).total_seconds() + if walltime_sec != 0: + end_time = stime + walltime_sec else: self.setHPCJobError(hpc_job, 'WALLTIME PARSE ERROR', f'Failed to parse walltime from "{walltime}"') - - - # Set end time if possible. PBS has an 'obittime' entry that can be used, - # but it includes a ton of cleanup time that isn't representative of the - # actual time. Here, we'll cheat and just append to the start time the - # walltime to determine the end time, if possible. If not possible, it'll - # just use the time for now which is fine. 
- end_time = None - if walltime_sec: - start_time = parse_time('stime') - if start_time: - end_time = start_time + walltime_sec + # If we don't have it yet, use the obit time + if not end_time: + obittime = parse_time('obittime') + if obittime: + end_time = obittime self.setHPCJobDone(hpc_job, exit_code, end_time) From ef30dad39b0b9d365869151a8c45673a90cee626 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 22 Aug 2024 21:40:28 -0600 Subject: [PATCH 192/243] Change placement option to one more suitable for testing --- python/TestHarness/TestHarness.py | 2 +- python/TestHarness/schedulers/RunHPC.py | 2 +- python/TestHarness/testers/Tester.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 97eae3d634ba..158b96a448c2 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -1077,7 +1077,7 @@ def parseCLArgs(self, argv): hpcgroup.add_argument('--hpc-host', nargs='+', action='store', dest='hpc_host', metavar='', help='The host(s) to use for submitting HPC jobs') hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=300, help='The time in seconds to wait for HPC output') - hpcgroup.add_argument('--hpc-place', nargs=1, action='store', dest='hpc_place', choices=['free', 'pack', 'scatter'], default='free', help='The default placement method for HPC jobs') + hpcgroup.add_argument('--hpc-scatter-procs', nargs=1, type=int, action='store', dest='hpc_scatter_procs', default=None, help='Set to run HPC jobs with scatter placement when the processor count is this or lower') hpcgroup.add_argument('--hpc-apptainer-bindpath', nargs=1, action='store', type=str, dest='hpc_apptainer_bindpath', help='Sets the apptainer bindpath for 
HPC jobs') hpcgroup.add_argument('--hpc-apptainer-no-home', action='store_true', dest='hpc_apptainer_no_home', help='Passes --no-home to apptainer for HPC jobs') hpcgroup.add_argument('--hpc-project', nargs=1, action='store', dest='hpc_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 33629ed0fc17..38bb4dc3c099 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -337,7 +337,7 @@ def submitJob(self, job, hold): 'NUM_THREADS': int(tester.getThreads(options)), 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), 'JOB_ID_VARIABLE': self.getHPCJobIDVariable(), - 'PLACE': self.options.hpc_place} + 'PLACE': tester.getHPCPlace(options)} if hold: submission_env['HOLD'] = 1 if self.options.hpc_pre_source: diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index f86559886503..0d1a18002522 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -774,3 +774,13 @@ def run(self, options, exit_code, runner_output): self.output += '\n' + "#"*80 + '\nTester skipped, reason: ' + self.getStatusMessage() + '\n' elif self.isFail(): self.output += '\n' + "#"*80 + '\nTester failed, reason: ' + self.getStatusMessage() + '\n' + + def getHPCPlace(self, options): + """ + Return the placement to use for HPC jobs + """ + if options.hpc_scatter_procs: + procs = self.getProcs(options) + if procs > 1 and procs <= options.hpc_scatter_procs: + return 'scatter' + return 'free' From 03a452d79b53e8e627588585284f72c1eefbc7da Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Thu, 22 Aug 2024 21:40:50 -0600 Subject: [PATCH 193/243] Queued to queue --- python/TestHarness/schedulers/RunHPC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 38bb4dc3c099..9955affa4504 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -867,6 +867,6 @@ def additionalResultSummary(self): for key, values in times.items(): averages[key] = statistics.mean(values) if values else 0 - result = f'Average queued time {averages["hpc_queued"]:.1f} seconds, ' + result = f'Average queue time {averages["hpc_queued"]:.1f} seconds, ' result += f'average output wait time {averages["hpc_wait_output"]:.1f} seconds.' return result From f193bd1631a13ae36fd446d924c4bdd989bb7b92 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 09:55:54 -0600 Subject: [PATCH 194/243] Add better comment about redirecting output --- python/TestHarness/testers/RunApp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index fd071a3d92ec..86fb28182be3 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -124,9 +124,9 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False - # We have non-deterministic issues when running with the HPC python wrapper - # and using --redirected-output. If the user explicitly requested more - # parallel procs, we can't run this + # Finalizing output using the current method in the submission script from the rank 0 process isn't + # really a good idea when output might exist on a different node. We could make that finalization + # more complex, but there isn't a need at the moment. 
if options.hpc and self.specs['redirect_output'] == True and int(self.specs['min_parallel']) > 1: self.addCaveats('hpc min_cpus=1') self.setStatus(self.skip) @@ -168,9 +168,9 @@ def getProcs(self, options): # Lower the ceiling ncpus = min(ncpus, int(self.specs['max_parallel'])) - # We have non-deterministic issues when running with the HPC python wrapper - # and using --redirected-output. Here, if the user didn't explicitly request - # to use more parallel procs, we'll limit it to 1 + # Finalizing output using the current method in the submission script from the rank 0 process isn't + # really a good idea when output might exist on a different node. We could make that finalization + # more complex, but there isn't a need at the moment. if options.hpc and self.specs['redirect_output'] == True and min_parallel == 1 and ncpus > 1: self.addCaveats('hpc min_cpus=1') return 1 From 43205fd331fb48d34b93b0b6cf93512640ad4d64 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 11:37:40 -0600 Subject: [PATCH 195/243] Get the correct exception --- python/TestHarness/schedulers/Scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index f3b6fe460d97..78a19545e9ec 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -510,8 +510,9 @@ def runJob(self, job, jobs): self.run(job) # Hand execution over to derived scheduler except Exception: with job.getLock(): + trace = traceback.format_exc() job.setStatus(StatusSystem().error, 'JOB RUN EXCEPTION') - job.appendOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) + job.appendOutput(f'Encountered an exception while running Job:\n{trace}') if timeout_timer: timeout_timer.cancel() From d0ca74abd9613dc087366dfb60668bd6a90f630d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 15:05:23 -0600 Subject: [PATCH 
196/243] Rework --sep-files - Removes the --sep-files-[ok,failed] options and combines them together - Gets --sep-files working with HPC --- .gitignore | 14 +- python/TestHarness/OutputInterface.py | 69 +++++++ python/TestHarness/TestHarness.py | 99 +++------ python/TestHarness/__init__.py | 1 + python/TestHarness/runners/HPCRunner.py | 35 ++-- python/TestHarness/runners/Runner.py | 32 ++- .../TestHarness/runners/SubprocessRunner.py | 14 +- python/TestHarness/schedulers/Job.py | 189 ++++++++++++------ python/TestHarness/schedulers/RunHPC.py | 13 +- python/TestHarness/schedulers/RunParallel.py | 15 +- python/TestHarness/schedulers/Scheduler.py | 2 - python/TestHarness/testers/Tester.py | 25 +-- python/TestHarness/util.py | 2 +- 13 files changed, 290 insertions(+), 220 deletions(-) create mode 100644 python/TestHarness/OutputInterface.py diff --git a/.gitignore b/.gitignore index 6d76766de75d..67ab743aa284 100644 --- a/.gitignore +++ b/.gitignore @@ -327,10 +327,12 @@ share/ /tutorials/tutorial01_app_development/*/babbler.yaml /tutorials/darcy_thermo_mech/*/darcy_thermo_mech.yaml +# test harness --sep-files output +*.runner_out.txt +*.tester_out.txt +*.job_out.txt + # test harness hpc output -pbs_*.qsub -pbs_*.out -pbs_*.result -slurm_*.sbatch -slurm_*.out -slurm_*.result +*.hpc_out.txt +*.hpc_result +*.hpc_submit diff --git a/python/TestHarness/OutputInterface.py b/python/TestHarness/OutputInterface.py new file mode 100644 index 000000000000..18b6111d3507 --- /dev/null +++ b/python/TestHarness/OutputInterface.py @@ -0,0 +1,69 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os + +class OutputInterface: + """ Helper class for writing output to either memory or a file """ + def __init__(self): 
+ # The in-memory output, if any + self.output = '' + # The path to write output to, if any + self.separate_output_path = None + + def setSeparateOutputPath(self, separate_output_path): + """ Sets the path for writing output to """ + self.separate_output_path = separate_output_path + + # If we have any dangling output, write it + if self.output: + self.setOutput(self.output) + self.output = '' + + def getSeparateOutputFilePath(self): + """ Gets the path that this output is writing to, if any """ + return self.separate_output_path + + def hasOutput(self): + """ Whether or not this object has any content written """ + if self.separate_output_path: + return os.path.isfile(self.separate_output_path) + return len(self.output) + + def getOutput(self): + """ Gets the underlying output, either from file or memory """ + if self.separate_output_path: + try: + return open(self.separate_output_path, 'r').read() + except FileNotFoundError: + pass + else: + return self.output + return '' + + def setOutput(self, output): + """ Sets the output given some output string """ + if self.separate_output_path: + open(self.separate_output_path, 'w').write(output) + else: + self.output = output + + def appendOutput(self, output): + """ Appends to the output """ + if self.separate_output_path: + open(self.separate_output_path, 'a').write(output) + else: + self.output += output + + def clearOutput(self): + """ Clears the output """ + if self.separate_output_path: + open(self.separate_output_path, 'w').close() + else: + self.output = '' diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 158b96a448c2..ece5f0f967b9 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -799,10 +799,8 @@ def writeResults(self): # Record the input file name that was used self.options.results_storage['INPUT_FILE_NAME'] = self.options.input_file_name - # Record that we are using --sep-files* options - self.options.results_storage['SEP_FILES'] 
= (self.options.ok_files - or self.options.fail_files - or self.options.sep_files) + # Record that we are using --sep-files + self.options.results_storage['SEP_FILES'] = self.options.sep_files # Record the Scheduler Plugin used self.options.results_storage['SCHEDULER'] = self.scheduler.__class__.__name__ @@ -812,31 +810,35 @@ def writeResults(self): for job in job_group: status, message, message_color, status_code, sort_value = job.getJointStatus() + if status == 'SILENT': + continue + # Create empty key based on TestDir, or re-inialize with existing data so we can append to it self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) - # If output has been stored in separate files, don't make additional copies by - # storing that data in this json results file (--sep-files, etc options). - output = '' if job.getOutputFile() else job.getOutput() - - self.options.results_storage[job.getTestDir()][job.getTestName()] = {'NAME' : job.getTestNameShort(), - 'LONG_NAME' : job.getTestName(), - 'TIMING' : job.getTiming(), - 'STATUS' : status, - 'STATUS_MESSAGE' : message, - 'FAIL' : job.isFail(), - 'COLOR' : message_color, - 'CAVEATS' : list(job.getCaveats()), - 'OUTPUT' : output, - 'COMMAND' : job.getCommand(), - 'META_DATA' : job.getMetaData()} + # Output that isn't in a file (no --sep-files) + output = job.getCombinedOutput() if not self.options.sep_files else None + # Output that is in a file (--sep-files) + output_files = job.getCombinedSeparateOutputPaths() if self.options.sep_files else None + + self.options.results_storage[job.getTestDir()][job.getTestName()] = {'NAME' : job.getTestNameShort(), + 'LONG_NAME' : job.getTestName(), + 'TIMING' : job.timer.totalTimes(), + 'STATUS' : status, + 'STATUS_MESSAGE' : message, + 'FAIL' : job.isFail(), + 'COLOR' : message_color, + 'CAVEATS' : list(job.getCaveats()), + 'OUTPUT' : output, + 'OUTPUT_FILES' : output_files, + 'TESTER_OUTPUT_FILES' : job.getOutputFiles(self.options), + 
'INPUT_FILE' : job.getInputFile(), + 'COMMAND' : job.getCommand(), + 'META_DATA' : job.getMetaData()} # Additional data to store (overwrites any previous matching keys) self.options.results_storage[job.getTestDir()].update(job.getMetaData()) - if self.options.output_dir: - self.options.results_file = os.path.join(self.options.output_dir, self.options.results_file) - if self.options.results_storage and self.options.results_file: try: with open(self.options.results_file, 'w') as data_file: @@ -856,7 +858,7 @@ def writeResults(self): try: # Write one file, with verbose information (--file) if self.options.file: - with open(os.path.join(self.output_dir, self.options.file), 'w') as f: + with open(self.options.file, 'w') as f: for job_group in all_jobs: for job in job_group: # Do not write information about silent tests @@ -866,33 +868,6 @@ def writeResults(self): formated_results = util.formatResult( job, self.options, result=job.getOutput(), color=False) f.write(formated_results + '\n') - # Write a separate file for each test with verbose information (--sep-files, --sep-files-ok, --sep-files-fail) - if ((self.options.ok_files and self.num_passed) - or (self.options.fail_files and self.num_failed)): - for job_group in all_jobs: - for job in job_group: - status, message, message_color, status_code, sort_value = job.getJointStatus() - - if self.options.output_dir: - output_dir = self.options.output_dir - else: - output_dir = job.getTestDir() - - output = '' - # Append input file contents to output - if self.options.include_input: - # This is a file i/o operation. 
We only want to do this once, and only if necessary - input_file = job.getInputFileContents() - if input_file: - output += "\n\nINPUT FILE:\n" + str(input_file) - - output += "\n\nTEST OUTPUT:" + job.getOutput() - output_file = job.getOutputFile() - formated_results = util.formatResult(job, self.options, result=output, color=False) - if output_file: - with open(output_file, 'w') as f: - f.write(formated_results) - except IOError: print('Permission error while writing results to disc') sys.exit(1) @@ -953,17 +928,6 @@ def initialize(self, argv, app_name): mydir = os.path.dirname(os.path.realpath(__file__)) self.executable = os.path.join(mydir, '../../../..', 'bin', self.executable) - # Save the output dir since the current working directory changes during tests - self.output_dir = os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])), self.options.output_dir) - - # Create the output dir if they ask for it. It is easier to ask for forgiveness than permission - if self.options.output_dir: - try: - os.makedirs(self.output_dir) - except OSError as ex: - if ex.errno == errno.EEXIST: pass - else: raise - # Use a previous results file, or declare the variable self.options.results_storage = {} if self.useExistingStorage(): @@ -1056,11 +1020,8 @@ def parseCLArgs(self, argv): outputgroup.add_argument('-q', '--quiet', action='store_true', dest='quiet', help='only show the result of every test, don\'t show test output even if it fails') outputgroup.add_argument('--no-report', action='store_false', dest='report_skipped', help='do not report skipped tests') outputgroup.add_argument('--show-directory', action='store_true', dest='show_directory', help='Print test directory path in out messages') - outputgroup.add_argument('-o', '--output-dir', nargs=1, metavar='directory', dest='output_dir', default='', help='Save all output files in the directory, and create it if necessary') outputgroup.add_argument('-f', '--file', nargs=1, action='store', dest='file', help='Write verbose 
output of each test to FILE and quiet output to terminal') - outputgroup.add_argument('-x', '--sep-files', action='store_true', dest='sep_files', help='Write the output of each test to a separate file. Only quiet output to terminal. This is equivalant to \'--sep-files-fail --sep-files-ok\'') - outputgroup.add_argument('--sep-files-ok', action='store_true', dest='ok_files', help='Write the output of each passed test to a separate file') - outputgroup.add_argument('-a', '--sep-files-fail', action='store_true', dest='fail_files', help='Write the output of each FAILED test to a separate file. Only quiet output to terminal.') + outputgroup.add_argument('-x', '--sep-files', action='store_true', dest='sep_files', help='Write the output of each test to a separate file. Only quiet output to terminal.') outputgroup.add_argument('--include-input-file', action='store_true', dest='include_input', help='Include the contents of the input file when writing the results of a test to a file') outputgroup.add_argument("--testharness-unittest", action="store_true", help="Run the TestHarness unittests that test the TestHarness.") outputgroup.add_argument("--json", action="store_true", dest="json", help="Dump the parameters for the testers in JSON Format") @@ -1132,8 +1093,6 @@ def parseCLArgs(self, argv): # Exit if options don't make any sense, print warnings if they are merely weird def checkAndUpdateCLArgs(self): opts = self.options - if opts.output_dir and not (opts.file or opts.sep_files or opts.fail_files or opts.ok_files): - print('WARNING: --output-dir is specified but no output files will be saved, use -f or a --sep-files option') if opts.group == opts.not_group: print('ERROR: The group and not_group options cannot specify the same group') sys.exit(1) @@ -1172,12 +1131,6 @@ def checkAndUpdateCLArgs(self): # User wants to write all output, so unify the options involved if opts.sep_files: - opts.ok_files = True - opts.fail_files = True - opts.quiet = True - - # User wants only 
failed files, so unify the options involved - elif opts.fail_files: opts.quiet = True def postRun(self, specs, timing): diff --git a/python/TestHarness/__init__.py b/python/TestHarness/__init__.py index 0c9708c685c5..59cc7b489648 100644 --- a/python/TestHarness/__init__.py +++ b/python/TestHarness/__init__.py @@ -13,5 +13,6 @@ sys.exit(1) from .TestHarness import TestHarness +from .OutputInterface import OutputInterface from .TestHarness import findDepApps __all__=['TestHarness', 'findDepApps'] diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 38f34fb99ae7..160e353c66e7 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -83,8 +83,7 @@ def wait(self, timer): wait_files = set([output_file, result_file]) # Output files needed by the Tester, only if it says we should if tester.mustOutputExist(self.exit_code): - for file in tester.getOutputFiles(self.options): - wait_files.add(os.path.join(tester.getTestDir(), file)) + wait_files.update(self.job.getOutputFiles(self.options)) # The files that we can read, but are incomplete (no terminator) incomplete_files = set() @@ -134,8 +133,8 @@ def wait(self, timer): self.trySetOutput() def print_files(files, type): if files: - self.output += util.outputHeader(f'{type} output file(s):', ending=False) - self.output += '\n'.join(files) + '\n' + self.appendOutput(util.outputHeader(f'{type} output file(s):', ending=False)) + self.appendOutput('\n'.join(files) + '\n') print_files(wait_files, 'Unavailable') print_files(incomplete_files, 'Incomplete') break @@ -161,14 +160,16 @@ def print_files(files, type): # apptainer execution within hpc_template. This allows the null # character check that happens in Runner.finalize() to still # be valid. 
- if self.exit_code != 0 and self.job.getTester().hasOpenMPI() and self.output: - prefix = '\n' - null = '\0' - suffix = '##########' - all = f'{prefix}{null}{suffix}' - no_null = f'{prefix}{suffix}' - if all in self.output: - self.output = self.output.replace(all, no_null) + if self.exit_code != 0 and self.job.getTester().hasOpenMPI(): + output = self.getOutput() + if output: + prefix = '\n' + null = '\0' + suffix = '##########' + all = f'{prefix}{null}{suffix}' + no_null = f'{prefix}{suffix}' + if all in output: + self.setOutput(output.replace(all, no_null)) def kill(self): if self.hpc_job: @@ -189,15 +190,13 @@ def trySetOutput(self, required=False): if os.path.exists(output_file) and os.path.isfile(output_file): try: header = f'{self.run_hpc.getHPCSchedulerName()} job {self.hpc_job.id} output' - self.output = util.outputHeader(f'Begin {header}') # If we're trying to parse output, we can't truncate it # because it might appear in the truncated portion - if self.job.getTester().needFullOutput(self.options): - self.output += open(output_file, 'r').read() + if self.job.getTester().needFullOutput(self.options) or self.job.hasSeperateOutput(): + self.setOutput(open(output_file, 'r').read()) # Not parsing the output, so just read it truncated else: - self.output += self.readTruncated(output_file) - self.output += util.outputHeader(f'End {header}') + self.setOutput(self.readTruncated(output_file)) did_set = True except: @@ -206,7 +205,7 @@ def trySetOutput(self, required=False): if did_set: self.output_completed = True else: - self.output = f'Failed to load output file {output_file}\n' + self.setOutput(f'Failed to load output file {output_file}\n') if required: self.job.setStatus(self.job.error, 'FAILED OUTPUT READ') diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index 7d54b954fa9e..970481481a37 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -8,9 +8,9 @@ #* 
https://www.gnu.org/licenses/lgpl-2.1.html import os, json -from TestHarness import util +from TestHarness import OutputInterface -class Runner: +class Runner(OutputInterface): """ Base class for running a process via a command. @@ -19,14 +19,14 @@ class Runner: or externally (i.e., PBS, slurm, etc on HPC) """ def __init__(self, job, options): + OutputInterface.__init__(self) + # The job that this runner is for self.job = job # The test harness options self.options = options # The job's exit code, should be set after wait() self.exit_code = None - # The output the job produced; to be filled in wait() - self.output = '' def spawn(self, timer): """ @@ -60,37 +60,33 @@ def finalize(self): """ # Load the redirected output files, if any for file_path in self.job.getTester().getRedirectedOutputFiles(self.options): - self.output += util.outputHeader(f'Begin redirected output {file_path}') + self.appendOutput(util.outputHeader(f'Begin redirected output {file_path}')) if os.access(file_path, os.R_OK): with open(file_path, 'r+b') as f: - self.output += self.readOutput(f) + self.appendOutput(self.readOutput(f)) else: self.job.setStatus(self.job.error, 'FILE TIMEOUT') - self.output += 'FILE UNAVAILABLE\n' - self.output += util.outputHeader(f'End redirected output {file_path}') + self.appendOutput('FILE UNAVAILABLE\n') + self.appendOutput(util.outputHeader(f'End redirected output {file_path}')) # Check for invalid unicode in output + output = self.getOutput() try: - json.dumps(self.output) + json.dumps(output) except UnicodeDecodeError: # Convert invalid output to something json can handle - self.output = self.output.decode('utf-8','replace').encode('ascii', 'replace') + self.setOutput(output.decode('utf-8','replace').encode('ascii', 'replace')) # Alert the user that output has invalid characters self.job.addCaveats('invalid characters in output') # Remove NULL output and fail if it exists + output = self.getOutput() null_chars = ['\0', '\x00'] for null_char in null_chars: - if 
null_char in self.output: - self.output = self.output.replace(null_char, 'NULL') + if null_char in output: + self.setOutput(output.replace(null_char, 'NULL')) self.job.setStatus(self.job.error, 'NULL characters in output') - def getOutput(self): - """ - Gets the combined output of the process. - """ - return self.output - def getExitCode(self): """ Gets the error code of the process. diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index 716705a367d5..3e8e9e54c915 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -79,25 +79,23 @@ def wait(self, timer): self.exit_code = self.process.poll() - # Header for the combined output - self.output = util.outputHeader('Begin combined stdout+stderr output') + self.clearOutput() # Load combined output for file in [self.outfile, self.errfile]: file.flush() - self.output += self.readOutput(file) + file_output = self.readOutput(file) file.close() # For some reason openmpi will append a null character at the end # when the exit code is nonzero. Not sure why this is... 
but remove # it until we figure out what's broken if file == self.errfile and self.exit_code != 0 \ - and self.job.getTester().hasOpenMPI() and len(self.output) > 2 \ - and self.output[-3:] == '\n\0\n': - self.output = self.output[0:-2] + and self.job.getTester().hasOpenMPI() and len(file_output) > 2 \ + and file_output[-3:] == '\n\0\n': + file_output = file_output[0:-2] - # Footer for the combined output - self.output += util.outputHeader('End combined stderr+stdout output') + self.appendOutput(file_output) def kill(self): if self.process is not None: diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 07cc41a772f0..792e864e7836 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -12,7 +12,7 @@ from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker from TestHarness.runners.Runner import Runner -from TestHarness import util +from TestHarness import OutputInterface, util from tempfile import TemporaryDirectory from collections import namedtuple import traceback @@ -103,12 +103,15 @@ def startTime(self, name): raise Exception(f'Missing time entry {name}') return entry[0] - def reset(self, name: str): - """ Resets a given timer """ + def reset(self, name = None): + """ Resets a given timer or all timers """ with self.lock: - if name not in self.times: - raise Exception(f'Missing time entry {name}') - del self.times[name] + if name: + if name not in self.times: + raise Exception(f'Missing time entry {name}') + del self.times[name] + else: + self.times.clear() class TimeManager: """ Context manager for timing a section """ @@ -124,7 +127,7 @@ def time(self, name: str): """ Time a section using a context manager """ return self.TimeManager(self, name) -class Job(object): +class Job(OutputInterface): """ The Job class is a simple container for the tester and its associated output file object, the DAG, the process object, the exit codes, and the start and 
end times. @@ -133,6 +136,8 @@ class Job(object): id_iter = itertools.count() def __init__(self, tester, job_dag, options): + OutputInterface.__init__(self) + self.id = next(self.id_iter) self.options = options self.__j_lock = threading.Lock() @@ -140,7 +145,6 @@ def __init__(self, tester, job_dag, options): self.specs = tester.specs self.__job_dag = job_dag self.timer = Timer() - self.__previous_time = None self.__joined_out = '' self.report_timer = None self.__slots = None @@ -189,11 +193,6 @@ def __init__(self, tester, job_dag, options): # The object that'll actually do the run self._runner = None - # Any additional output produced by the Job (not from the Tester or Runner) - self.output = '' - - self.cached_output = None - # A temp directory for this Job, if requested self.tmp_dir = None @@ -300,8 +299,11 @@ def getRunnable(self): return self.__tester.getRunnable(self.options) def getOutputFiles(self, options): - """ Wrapper method to return getOutputFiles """ - return self.__tester.getOutputFiles(options) + """ Wrapper method to return getOutputFiles (absolute path) """ + files = [] + for file in self.__tester.getOutputFiles(options): + files.append(os.path.join(self.__tester.getTestDir(), file)) + return files def getMaxTime(self): """ Wrapper method to return getMaxTime """ @@ -309,7 +311,10 @@ def getMaxTime(self): def getInputFile(self): """ Wrapper method to return input filename """ - return self.__tester.getInputFile() + input_file = self.__tester.getInputFile() + if input_file: + return os.path.join(self.getTestDir(), input_file) + return None def getInputFileContents(self): """ Wrapper method to return input file contents """ @@ -358,11 +363,18 @@ def run(self): """ tester = self.__tester + # Set the output path if its separate and initialize the output + if self.hasSeperateOutput(): + for name, object in self.getOutputObjects().items(): + output_path = self.getOutputPathPrefix() + f'.{name}_out.txt' + object.setSeparateOutputPath(output_path) + 
object.clearOutput() + # Start the main timer for running self.timer.startMain() # Helper for exiting - def cleanup(): + def finalize(): with self.timer.time('job_cleanup'): self.cleanup() self.timer.stopMain() @@ -376,12 +388,11 @@ def try_catch(do, exception_name, timer_name): except: trace = traceback.format_exc() self.setStatus(self.error, f'{exception_name} EXCEPTION') - self.output += util.outputHeader('Python exception encountered') - self.output += trace + self.appendOutput(util.outputHeader('Python exception encountered') + trace) failed = True if failed: - cleanup() + finalize() return not failed # Do not execute app, but still run the tester @@ -405,12 +416,12 @@ def try_catch(do, exception_name, timer_name): # Verify that the working directory is available right before we execute if not os.path.exists(tester.getTestDir()): self.setStatus(self.error, 'WORKING DIRECTORY NOT FOUND') - cleanup() + finalize() return # Getting the command can also cause a failure, so try that tester.getCommand(self.options) if tester.isError(): - cleanup() + finalize() return # Spawn the process @@ -431,7 +442,7 @@ def try_catch(do, exception_name, timer_name): # Job error occurred, which means the Runner didn't complete # so don't process anything else if self.isError(): - cleanup() + finalize() return # And do finalize (really just cleans up output) @@ -452,8 +463,8 @@ def try_catch(do, exception_name, timer_name): run_tester = lambda: tester.run(self.options, exit_code, runner_output) try_catch(run_tester, 'TESTER RUN', 'tester_run') - # Run cleanup now that we're done - cleanup() + # Run finalize now that we're done + finalize() def killProcess(self): """ Kill remaining process that may be running """ @@ -464,55 +475,104 @@ def killProcess(self): pass self.cleanup() - def getOutput(self): - """ Return the combined contents of output """ - # Cached output is used when reading from a results file, - # when we don't run anything and just populate results - if 
self.cached_output: - return self.cached_output - - # Concatenate output in order of Runner, Tester, Job - output = '' - object_outputs = [self.getRunner().getOutput() if self.getRunner() else '', - self.getTester().getOutput() if self.getTester() else '', - self.output] - for object_output in object_outputs: + def getOutputObjects(self) -> dict: + """ + Get a dict of all of the objects that contribute to output + + The key is a name which is a human readable name of the object + """ + objects = {} + if self.getRunner(): + objects['runner'] = self.getRunner() + objects['tester'] = self.getTester() + objects['job'] = self + return objects + + def getCombinedSeparateOutputPaths(self): + """ + Gets a dict of all of the --sep-files file paths that were produced + + The key is a name which is a human readable name of the object + """ + paths = {} + for name, object in self.getOutputObjects().items(): + paths[name] = object.getSeparateOutputFilePath() if object.hasOutput() else None + return paths + + def getCombinedOutput(self, concatenate=False): + """ Return individual output from each object """ + output = '' if concatenate else {} + for name, object in self.getOutputObjects().items(): + object_output = object.getOutput() if object_output: - # Append an extra line if we're missing one - if output and output[-1] != '\n': - output += '\n' - output += object_output + wrapped_object_output = '' + + if concatenate: + # Add a complete line break between objects + if output: + wrapped_object_output += '\n' + # Add a header before the output starts + wrapped_object_output += util.outputHeader(f'Begin {name} output', ending=False) + '\n' + + # Add the actual output + wrapped_object_output += object_output + + if concatenate: + # Add a newline if one is missing + if wrapped_object_output[-1] != '\n': + wrapped_object_output += '\n' + # Add a footer after the output ends + wrapped_object_output += util.outputHeader(f'End {name} output', ending=False) + + output += 
wrapped_object_output + else: + output[name] = wrapped_object_output + elif not concatenate: + output[name] = '' return output + def setPreviousOutputs(self, outputs): + """ Sets outputs from a previous run of this Job """ + for name, object in self.getOutputObjects().items(): + object.setOutput(outputs[name]) + + def setPreviousSeparateOutputs(self, output_paths): + """ Sets --sep-files outputs from a previous run of this Job """ + for name, object in self.getOutputObjects().items(): + object.setSeparateOutputPath(output_paths[name]) + def getRunner(self): """ Gets the Runner that actually runs the command """ return self._runner - def getOutputFile(self): - """ Return the output file path """ - if ((self.options.ok_files - or self.options.fail_files - or self.options.sep_files) - and (self.isPass() or self.isFail())): - (status, message, color, exit_code, sort_value) = self.getJointStatus() - output_dir = self.options.output_dir if self.options.output_dir else self.getTestDir() - output_file = os.path.join(output_dir, - '.'.join([os.path.basename(self.getTestDir()), - self.getTestNameShort().replace(os.sep, '.'), - status, - 'txt'])) - return os.path.join(output_dir, output_file) - - def appendOutput(self, output): - self.output += output - - def setPreviousTime(self, t): + def getOutputPathPrefix(self): + """ + Returns a file prefix that is unique to this job + + Should be used for all TestHarness produced files for this job + """ + return os.path.join(self.getTestDir(), self.getTestNameShort().replace(os.sep, '.')) + + def hasSeperateOutput(self): + """ + Whether or not this job has separate output. + + That is, whether or not we should pipe output to a file + """ + return self.options.sep_files + + + def setPreviousTimer(self, timer_dict): """ - Allow an arbitrary time to be set. This is used by the QueueManager + Allow arbitrary timer times to be set. This is used by the QueueManager to set the time as recorded by a previous TestHarness instance. 
""" - self.__previous_time = t + self.timer.reset() + time_now = Timer.time_now() + for name, total_time in timer_dict.items(): + self.timer.start(name, time_now) + self.timer.stop(name, time_now + total_time) def getTiming(self): """ Return active time if available, if not return a comparison of start and end time """ @@ -522,9 +582,6 @@ def getTiming(self): # Job has started if self.timer.hasTime('main'): return self.timer.totalTime() - # Previous time is set - if self.__previous_time: - return self.__previous_time return 0.0 def getStatus(self): diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 9955affa4504..9cfd7d2ce6c4 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -16,7 +16,7 @@ import copy import statistics from multiprocessing.pool import ThreadPool -from TestHarness import util +from TestHarness import OutputInterface, util class HPCJob: # The valid job states for a HPC job @@ -794,22 +794,17 @@ def getHPCJobName(job) -> str: """ return job.getTestName().replace(':', '.').replace('/', '.') - def getHPCJobOutputPathPrefix(self, job): - """Gets the absolute path prefix for a HPC job""" - scheduler_name = self.getHPCSchedulerName() - return os.path.join(job.getTestDir(), f"{scheduler_name}_" + job.getTestNameShort().replace('/', '.')) - def getHPCJobOutputPath(self, job): """Gets the absolute path for stdout/stderr for a HPC job""" - return self.getHPCJobOutputPathPrefix(job) + '.out' + return job.getOutputPathPrefix() + '.hpc_out.txt' def getHPCJobResultPath(self, job): """Gets the absolute path for the result (exit code, walltime) for a HPC job""" - return self.getHPCJobOutputPathPrefix(job) + '.result' + return job.getOutputPathPrefix() + '.hpc_result' def getHPCJobSubmissionPath(self, job): """Gets the aboslute path for the qsub script for a HPC job""" - return self.getHPCJobOutputPathPrefix(job) + f'.{self.getHPCSubmissionCommand()}' + return 
job.getOutputPathPrefix() + '.hpc_submit' @staticmethod def getOutputEndingComment(job_id) -> str: diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 0ee44473a9cf..e3837c2a1478 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -45,8 +45,11 @@ def run(self, job): tester.setStatus(status, message) if caveats: tester.addCaveats(caveats) - job.setPreviousTime(job_results['TIMING']) - job.cached_output = job_results['OUTPUT'] + job.setPreviousTimer(job_results['TIMING']) + if self.options.results_storage['SEP_FILES']: + job.setPreviousSeparateOutputs(job_results['OUTPUT_FILES']) + else: + job.setPreviousOutputs(job_results['OUTPUT']) return # Anything that throws while running or processing a job should be caught @@ -63,8 +66,12 @@ def run(self, job): job.appendOutput(traceback.format_exc()) job.setStatus(job.error, 'JOB EXCEPTION') - if job.getOutputFile(): - job.addMetaData(DIRTY_FILES=[job.getOutputFile()]) + if job.hasSeperateOutput(): + dirty_files = [] + for object in job.getOutputObjects().values(): + if object.hasOutput(): + dirty_files.append(object.getSeparateOutputFilePath()) + job.addMetaData(DIRTY_FILES=dirty_files) def buildRunner(self, job, options) -> Runner: """Builds the runner for a given tester diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 78a19545e9ec..2c5d430adc4f 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -423,8 +423,6 @@ def jobStatus(self, job, caveats): if clock() - self.last_reported_time >= self.min_report_time: # prevent 'finished' caveat with options expecting to take lengthy amounts of time if (not self.options.sep_files - and not self.options.ok_files - and not self.options.fail_files and not self.options.hpc and not self.options.heavy_tests and not self.options.valgrind_mode): diff --git 
a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 0d1a18002522..52935b0da8f3 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -9,12 +9,12 @@ import re, os, sys, shutil import mooseutils -from TestHarness import util +from TestHarness import OutputInterface, util from TestHarness.StatusSystem import StatusSystem from FactorySystem.MooseObject import MooseObject from pathlib import Path -class Tester(MooseObject): +class Tester(MooseObject, OutputInterface): """ Base class from which all tester objects are instanced. """ @@ -122,6 +122,8 @@ def validParams(): def __init__(self, name, params): MooseObject.__init__(self, name, params) + OutputInterface.__init__(self) + self.specs = params self.joined_out = '' self.process = None @@ -164,13 +166,6 @@ def __init__(self, name, params): # depending on the runner which might inject something self.command_ran = None - # The tester output - self.output = '' - - def getOutput(self) -> str: - """Return the Tester output""" - return self.output - def getStatus(self): return self.test_status.getStatus() @@ -705,9 +700,7 @@ def checkRunnableBase(self, options): self.setStatus(self.fail, 'ABSOLUTE PATH DETECTED') # We can't offer the option of reading output files outside of initial TestDir - if self.specs['working_directory'] and (options.ok_files - or options.fail_files - or options.sep_files): + if self.specs['working_directory'] and options.sep_files: reasons['working_directory'] = '--sep-files* enabled' # Explicitly skip HPC tests @@ -767,13 +760,15 @@ def needFullOutput(self, options): return False def run(self, options, exit_code, runner_output): - self.output = self.processResults(self.getMooseDir(), options, exit_code, runner_output) + output = self.processResults(self.getMooseDir(), options, exit_code, runner_output) # If the tester requested to be skipped at the last minute, report that. 
if self.isSkip(): - self.output += '\n' + "#"*80 + '\nTester skipped, reason: ' + self.getStatusMessage() + '\n' + output += f'\nTester skipped, reason: {self.getStatusMessage()}\n' elif self.isFail(): - self.output += '\n' + "#"*80 + '\nTester failed, reason: ' + self.getStatusMessage() + '\n' + output += f'\nTester failed, reason: {self.getStatusMessage()}\n' + + self.setOutput(output) def getHPCPlace(self, options): """ diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index 5e5c119a9a35..5e8201430845 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -871,7 +871,7 @@ def deleteFilesAndFolders(test_dir, paths, delete_folders=True): # Trimming routines for job output def trimOutput(job, options): - output = job.getOutput() + output = job.getCombinedOutput(concatenate=True) if ((job.isFail() and options.no_trimmed_output_on_error) or (job.specs.isValid('max_buffer_size') and job.specs['max_buffer_size'] == -1) or options.no_trimmed_output): From 7fdbf396a2f3da2335826616fa518c9486ea6b35 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 15:37:46 -0600 Subject: [PATCH 197/243] Bring back output_dir and use it --- python/TestHarness/TestHarness.py | 15 ++++++- python/TestHarness/schedulers/Job.py | 45 ++++++++++++++++---- python/TestHarness/schedulers/RunParallel.py | 4 +- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index ece5f0f967b9..e1d1e5fba0da 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -839,6 +839,10 @@ def writeResults(self): # Additional data to store (overwrites any previous matching keys) self.options.results_storage[job.getTestDir()].update(job.getMetaData()) + if self.options.output_dir: + self.options.output_dir = os.path.abspath(self.options.output_dir) + self.options.results_file = os.path.join(self.options.output_dir, self.options.results_file) + if 
self.options.results_storage and self.options.results_file:
             try:
                 with open(self.options.results_file, 'w') as data_file:
@@ -858,7 +862,7 @@ def writeResults(self):
         try:
             # Write one file, with verbose information (--file)
             if self.options.file:
-                with open(self.options.file, 'w') as f:
+                with open(os.path.join(self.options.output_dir, self.options.file), 'w') as f:
                     for job_group in all_jobs:
                         for job in job_group:
                             # Do not write information about silent tests
@@ -928,6 +932,14 @@ def initialize(self, argv, app_name):
             mydir = os.path.dirname(os.path.realpath(__file__))
             self.executable = os.path.join(mydir, '../../../..', 'bin', self.executable)
 
+        # Create the output dir if they ask for it. It is easier to ask for forgiveness than permission
+        if self.options.output_dir:
+            try:
+                os.makedirs(self.options.output_dir)
+            except OSError as ex:
+                if ex.errno == errno.EEXIST: pass
+                else: raise
+
         # Use a previous results file, or declare the variable
         self.options.results_storage = {}
         if self.useExistingStorage():
@@ -1020,6 +1032,7 @@ def parseCLArgs(self, argv):
         outputgroup.add_argument('-q', '--quiet', action='store_true', dest='quiet', help='only show the result of every test, don\'t show test output even if it fails')
         outputgroup.add_argument('--no-report', action='store_false', dest='report_skipped', help='do not report skipped tests')
         outputgroup.add_argument('--show-directory', action='store_true', dest='show_directory', help='Print test directory path in out messages')
+        outputgroup.add_argument('-o', '--output-dir', nargs=1, metavar='directory', dest='output_dir', default='', help='Save all output files in the directory, and create it if necessary')
        outputgroup.add_argument('-f', '--file', nargs=1, action='store', dest='file', help='Write verbose output of each test to FILE and quiet output to terminal')
        outputgroup.add_argument('-x', '--sep-files', action='store_true', dest='sep_files', help='Write the output of each test to a separate file. 
Only quiet output to terminal.') outputgroup.add_argument('--include-input-file', action='store_true', dest='include_input', help='Include the contents of the input file when writing the results of a test to a file') diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 792e864e7836..7c617bd75ce0 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -135,6 +135,9 @@ class Job(OutputInterface): # Iterator for producing a unique Job ID id_iter = itertools.count() + # Thread lock for creating output directories + mkdir_lock = threading.Lock() + def __init__(self, tester, job_dag, options): OutputInterface.__init__(self) @@ -363,13 +366,6 @@ def run(self): """ tester = self.__tester - # Set the output path if its separate and initialize the output - if self.hasSeperateOutput(): - for name, object in self.getOutputObjects().items(): - output_path = self.getOutputPathPrefix() + f'.{name}_out.txt' - object.setSeparateOutputPath(output_path) - object.clearOutput() - # Start the main timer for running self.timer.startMain() @@ -379,6 +375,33 @@ def finalize(): self.cleanup() self.timer.stopMain() + # Set the output path if its separate and initialize the output + if self.hasSeperateOutput(): + # Create the directory for output if it's separate + if self.options.output_dir: + # To prevent jobs clobbering when making the directory + with Job.mkdir_lock: + job_output_dir = self.getOutputDirectory() + if not os.path.isdir(job_output_dir): + try: + os.makedirs(job_output_dir) + except OSError as ex: + if ex.errno == errno.EEXIST: + pass + else: + self.setStatus(self.error, f'DIRECTORY CREATION FAILURE') + self.appendOutput(f'Failed to create directory {job_output_dir}') + + # Failed to create the directory + if self.isError(): + finalize() + return + + for name, object in self.getOutputObjects().items(): + output_path = self.getOutputPathPrefix() + f'.{name}_out.txt' + 
object.setSeparateOutputPath(output_path) + object.clearOutput() + # Helper for trying and catching def try_catch(do, exception_name, timer_name): with self.timer.time(timer_name): @@ -546,13 +569,19 @@ def getRunner(self): """ Gets the Runner that actually runs the command """ return self._runner + def getOutputDirectory(self): + """ Get the directory for output for this job """ + if not self.options.output_dir: + return self.getTestDir() + return os.path.join(self.options.output_dir, self.getTestName()[:-len(self.getTestNameShort())-1]) + def getOutputPathPrefix(self): """ Returns a file prefix that is unique to this job Should be used for all TestHarness produced files for this job """ - return os.path.join(self.getTestDir(), self.getTestNameShort().replace(os.sep, '.')) + return os.path.join(self.getOutputDirectory(), self.getTestNameShort().replace(os.sep, '.')) def hasSeperateOutput(self): """ diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index e3837c2a1478..66c21a5c5358 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -62,8 +62,8 @@ def run(self, job): if not tester.isSkip() and not job.isFail(): self.setSuccessfulMessage(tester) except: - job.appendOutput(util.outputHeader('Python exception encountered in Job')) - job.appendOutput(traceback.format_exc()) + trace = traceback.format_exc() + job.appendOutput(util.outputHeader('Python exception encountered in Job') + trace) job.setStatus(job.error, 'JOB EXCEPTION') if job.hasSeperateOutput(): From 65be61c4cb2f4d82888a31a97722a0098af04a13 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 16:03:09 -0600 Subject: [PATCH 198/243] Cleanup output, simplify ending comment --- python/TestHarness/runners/HPCRunner.py | 5 +++++ python/TestHarness/schedulers/Job.py | 2 +- python/TestHarness/schedulers/RunHPC.py | 2 +- python/TestHarness/schedulers/hpc_template | 14 +++++--------- 4 
files changed, 12 insertions(+), 11 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 160e353c66e7..47afd43f5756 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -171,6 +171,11 @@ def print_files(files, type): if all in output: self.setOutput(output.replace(all, no_null)) + # Report queue time and output wait time + stats = f'Queue time = {timer.totalTime("hpc_queued"):.2f} s, ' + stats += f'output wait time = {timer.totalTime("hpc_wait_output"):.2f} s\n' + self.appendOutput(stats) + def kill(self): if self.hpc_job: self.run_hpc.killHPCJob(self.hpc_job) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 7c617bd75ce0..61c0276d3659 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -545,7 +545,7 @@ def getCombinedOutput(self, concatenate=False): if wrapped_object_output[-1] != '\n': wrapped_object_output += '\n' # Add a footer after the output ends - wrapped_object_output += util.outputHeader(f'End {name} output', ending=False) + wrapped_object_output += '\n' + util.outputHeader(f'End {name} output', ending=False) output += wrapped_object_output else: diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 9cfd7d2ce6c4..ef60eb992ce9 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -813,7 +813,7 @@ def getOutputEndingComment(job_id) -> str: that are read in order to verify that the files are fully synced when reading during postprocessing. 
""" - return f'TESTHARNESS RUNHPC FILE TERMINATOR FOR {job_id}\n' + return f'TESTHARNESS RUNHPC FILE TERMINATOR FOR {job_id}' @staticmethod def parseMPICommand(command) -> str: diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index 8b8879e6b58d..3bb858be12d3 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -99,24 +99,20 @@ echo "walltime: $walltime" >> {{ RESULT }} ADDITIONAL_OUTPUT_FILES=({{ ADDITIONAL_OUTPUT_FILES }}) for file in ${ADDITIONAL_OUTPUT_FILES[@]}; do if [ ! -e "$file" ]; then - echo "Failed to finalize output $file" + echo "Failed to find output $file" continue fi - # No newline for binaries - if [[ $(file --mime-encoding ${file}) = *binary ]]; then - printf "{{ ENDING_COMMENT }}" >> $file; - # Newline for plain text - else - printf "\n{{ ENDING_COMMENT }}" >> $file; + printf "{{ ENDING_COMMENT }}" >> $file; + if [ $? != 0 ]; then + echo "Failed to finalize output $file" fi - echo "Finalized output $file" done # Append a recognizable string at the end of the output. 
We look # for this string when parsing the output so that we can be sure # that we have obtained all of the output -printf "\n{{ ENDING_COMMENT }}" +printf "{{ ENDING_COMMENT }}" # Exit with the real return code from the job that we ran exit $return_code From b112617a902242b6282236553bc802b1d1f8e386 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 16:27:54 -0600 Subject: [PATCH 199/243] Print errors even with verbose --- python/TestHarness/TestHarness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index e1d1e5fba0da..70c834c515b7 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -605,7 +605,7 @@ def printOutput(self, job, color): """ Method to print a testers output to the screen """ output = '' # Print what ever status the tester has at the time - if self.options.verbose or (job.isFail() and not self.options.quiet): + if self.options.verbose or (job.isFail() and not self.options.quiet) or job.isError(): if job.getCommandRan(): command = job.getCommandRan() else: From b00453efc88d2e106183fe2bd906315583a9b0c9 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 16:28:16 -0600 Subject: [PATCH 200/243] Create the output directories for job submissions, too --- python/TestHarness/schedulers/Job.py | 33 ++++++++++++++----------- python/TestHarness/schedulers/RunHPC.py | 5 ++++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 61c0276d3659..4fdd4f0d07e0 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -377,26 +377,15 @@ def finalize(): # Set the output path if its separate and initialize the output if self.hasSeperateOutput(): - # Create the directory for output if it's separate - if self.options.output_dir: - # To prevent jobs clobbering when making the directory - 
with Job.mkdir_lock: - job_output_dir = self.getOutputDirectory() - if not os.path.isdir(job_output_dir): - try: - os.makedirs(job_output_dir) - except OSError as ex: - if ex.errno == errno.EEXIST: - pass - else: - self.setStatus(self.error, f'DIRECTORY CREATION FAILURE') - self.appendOutput(f'Failed to create directory {job_output_dir}') + # Need to potentially create the output directory + self.createOutputDirectory() # Failed to create the directory if self.isError(): finalize() return + # Set the output path for each object for name, object in self.getOutputObjects().items(): output_path = self.getOutputPathPrefix() + f'.{name}_out.txt' object.setSeparateOutputPath(output_path) @@ -575,6 +564,22 @@ def getOutputDirectory(self): return self.getTestDir() return os.path.join(self.options.output_dir, self.getTestName()[:-len(self.getTestNameShort())-1]) + def createOutputDirectory(self): + """ Create the output directory for this job, if needed """ + if not self.options.output_dir: + return + output_dir = self.getOutputDirectory() + with Job.mkdir_lock: + if not os.path.isdir(output_dir): + try: + os.makedirs(output_dir) + except OSError as ex: + if ex.errno == errno.EEXIST: + pass + else: + self.setStatus(self.error, f'DIRECTORY CREATION FAILURE') + self.appendOutput(f'Failed to create Job directory {output_dir}') + def getOutputPathPrefix(self): """ Returns a file prefix that is unique to this job diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index ef60eb992ce9..b6254ec06d0c 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -308,6 +308,11 @@ def submitJob(self, job, hold): tester = job.getTester() options = self.options + # If we have a separate output directory, we might need to create this + # for the files that follow. 
This won't do anything if it exists and + # it is thread safe + job.createOutputDirectory() + submission_script = self.getHPCJobSubmissionPath(job) output_file = self.getHPCJobOutputPath(job) result_file = self.getHPCJobResultPath(job) From 25c44a3a4000be761309daaf55a7b2da08ee5e3b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 17:16:06 -0600 Subject: [PATCH 201/243] Separate runner run output as HPC output is already in a file --- python/TestHarness/OutputInterface.py | 19 +++-- python/TestHarness/runners/HPCRunner.py | 79 +++---------------- python/TestHarness/runners/Runner.py | 9 +++ .../TestHarness/runners/SubprocessRunner.py | 23 +++--- python/TestHarness/schedulers/Job.py | 3 +- 5 files changed, 47 insertions(+), 86 deletions(-) diff --git a/python/TestHarness/OutputInterface.py b/python/TestHarness/OutputInterface.py index 18b6111d3507..3447ef5e107f 100644 --- a/python/TestHarness/OutputInterface.py +++ b/python/TestHarness/OutputInterface.py @@ -26,17 +26,17 @@ def setSeparateOutputPath(self, separate_output_path): self.setOutput(self.output) self.output = '' - def getSeparateOutputFilePath(self): + def getSeparateOutputFilePath(self) -> str: """ Gets the path that this output is writing to, if any """ return self.separate_output_path - def hasOutput(self): + def hasOutput(self) -> bool: """ Whether or not this object has any content written """ if self.separate_output_path: return os.path.isfile(self.separate_output_path) - return len(self.output) + return len(self.output) > 0 - def getOutput(self): + def getOutput(self) -> str: """ Gets the underlying output, either from file or memory """ if self.separate_output_path: try: @@ -47,15 +47,19 @@ def getOutput(self): return self.output return '' - def setOutput(self, output): + def setOutput(self, output: str): """ Sets the output given some output string """ + if not output: + return if self.separate_output_path: open(self.separate_output_path, 'w').write(output) else: self.output = 
output - def appendOutput(self, output): + def appendOutput(self, output: str): """ Appends to the output """ + if not output: + return if self.separate_output_path: open(self.separate_output_path, 'a').write(output) else: @@ -64,6 +68,7 @@ def appendOutput(self, output): def clearOutput(self): """ Clears the output """ if self.separate_output_path: - open(self.separate_output_path, 'w').close() + if os.path.exists(self.separate_output_path): + os.remove(self.separate_output_path) else: self.output = '' diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 47afd43f5756..69eef3edfbc7 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -30,10 +30,14 @@ def __init__(self, job, options, run_hpc): # Interval in seconds for polling for file completion self.file_completion_poll_time = 0.1 - # Whether or not the primary output has been loaded fully - self.output_completed = False - def spawn(self, timer): + # The runner_run output, which is the output from what we're + # actually running, already exists as a file. So just load + # it from that file instead and don't bother loading it + # into memory + hpc_job_output_path = self.run_hpc.getHPCJobOutputPath(self.job) + self.runner_output.setSeparateOutputPath(hpc_job_output_path) + # Rely on the RunHPC object to queue the job self.hpc_job = self.run_hpc.queueJob(self.job) @@ -61,20 +65,7 @@ def wait(self, timer): # If the Job is already finished, something happened in the # HPC scheduler so we have an invalid state for processing - # in the Tester if self.job.isFinished(): - # Don't bother if we've been killed - if self.hpc_job.isKilled(): - return - - # If we have _some_ output, at least try to load it. However, don't wait - # a while for this one. 
- for i in range(int(60 / self.file_completion_poll_time)): - if self.trySetOutput(): - break - time.sleep(self.file_completion_poll_time) - - # Don't bother looking for the rest of the output return tester = self.job.getTester() @@ -105,13 +96,8 @@ def wait(self, timer): # Check for file completeness for file in incomplete_files.copy(): if self.fileIsReady(file): - # Store the output - if file == output_file: - # It's now required because its complete - if not self.trySetOutput(required=True): - break # Store the result - elif file == result_file: + if file == result_file: with open(file, 'r') as f: result = yaml.safe_load(f) self.exit_code = result['exit_code'] @@ -129,11 +115,9 @@ def wait(self, timer): # We've waited for files for too long if (wait_files or incomplete_files) and waited_time >= self.options.hpc_file_timeout: self.job.setStatus(self.job.timeout, 'FILE TIMEOUT') - if not self.output_completed: - self.trySetOutput() def print_files(files, type): if files: - self.appendOutput(util.outputHeader(f'{type} output file(s):', ending=False)) + self.appendOutput(f'{type} output file(s)\n') self.appendOutput('\n'.join(files) + '\n') print_files(wait_files, 'Unavailable') print_files(incomplete_files, 'Incomplete') @@ -161,7 +145,7 @@ def print_files(files, type): # character check that happens in Runner.finalize() to still # be valid. 
if self.exit_code != 0 and self.job.getTester().hasOpenMPI(): - output = self.getOutput() + output = self.getRunOutput().getOutput() if output: prefix = '\n' null = '\0' @@ -169,53 +153,12 @@ def print_files(files, type): all = f'{prefix}{null}{suffix}' no_null = f'{prefix}{suffix}' if all in output: - self.setOutput(output.replace(all, no_null)) - - # Report queue time and output wait time - stats = f'Queue time = {timer.totalTime("hpc_queued"):.2f} s, ' - stats += f'output wait time = {timer.totalTime("hpc_wait_output"):.2f} s\n' - self.appendOutput(stats) + self.getRunOutput().setOutput(output.replace(all, no_null)) def kill(self): if self.hpc_job: self.run_hpc.killHPCJob(self.hpc_job) - def trySetOutput(self, required=False): - """ - Tries to set the output if it exists. - - If required is set, this will fail the job. - - Returns whether or not the output was set. - """ - # Whether or not we actually set it - did_set = False - - output_file = self.run_hpc.getHPCJobOutputPath(self.job) - if os.path.exists(output_file) and os.path.isfile(output_file): - try: - header = f'{self.run_hpc.getHPCSchedulerName()} job {self.hpc_job.id} output' - # If we're trying to parse output, we can't truncate it - # because it might appear in the truncated portion - if self.job.getTester().needFullOutput(self.options) or self.job.hasSeperateOutput(): - self.setOutput(open(output_file, 'r').read()) - # Not parsing the output, so just read it truncated - else: - self.setOutput(self.readTruncated(output_file)) - - did_set = True - except: - pass - - if did_set: - self.output_completed = True - else: - self.setOutput(f'Failed to load output file {output_file}\n') - if required: - self.job.setStatus(self.job.error, 'FAILED OUTPUT READ') - - return did_set - def fileIsReady(self, file): """ Checks if a file is ready for reading. 
diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index 970481481a37..c09abac3e990 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -27,6 +27,15 @@ def __init__(self, job, options): self.options = options # The job's exit code, should be set after wait() self.exit_code = None + # The output for the actual run of the job. We keep this + # separate from self.output in this Runner because HPC + # jobs always have a file output, so we want to store + # their output separately + self.run_output = OutputInterface() + + def getRunOutput(self): + """ Get the OutputInterface object for the actual run """ + return self.run_output def spawn(self, timer): """ diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index 3e8e9e54c915..e6feb681d17c 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -43,10 +43,10 @@ def spawn(self, timer): process_args = [cmd] process_kwargs = {'stdout': self.outfile, - 'stderr': self.errfile, - 'close_fds': False, - 'shell': use_shell, - 'cwd': tester.getTestDir()} + 'stderr': self.errfile, + 'close_fds': False, + 'shell': use_shell, + 'cwd': tester.getTestDir()} # On Windows, there is an issue with path translation when the command is passed in # as a list. if platform.system() == "Windows": @@ -79,23 +79,26 @@ def wait(self, timer): self.exit_code = self.process.poll() - self.clearOutput() + # This should have been cleared before the job started + if self.getRunOutput().hasOutput(): + raise Exception('Runner run output was not cleared') # Load combined output for file in [self.outfile, self.errfile]: file.flush() - file_output = self.readOutput(file) + output = self.readOutput(file) file.close() # For some reason openmpi will append a null character at the end # when the exit code is nonzero. Not sure why this is... 
but remove # it until we figure out what's broken if file == self.errfile and self.exit_code != 0 \ - and self.job.getTester().hasOpenMPI() and len(file_output) > 2 \ - and file_output == '\n\0\n': - file_output = file_output + and self.job.getTester().hasOpenMPI() and len(output) > 2 \ + and output[-3] == '\n\0\n': + output = output[:-3] - self.appendOutput(file_output) + if output: + self.getRunOutput().appendOutput(output) def kill(self): if self.process is not None: diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 4fdd4f0d07e0..0359b2b93eb6 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -495,6 +495,7 @@ def getOutputObjects(self) -> dict: """ objects = {} if self.getRunner(): + objects['runner_run'] = self.getRunner().getRunOutput() objects['runner'] = self.getRunner() objects['tester'] = self.getTester() objects['job'] = self @@ -540,7 +541,7 @@ def getCombinedOutput(self, concatenate=False): else: output[name] = wrapped_object_output elif not concatenate: - output[name] = '' + output[name] = None return output From ade6bc9bd29637d5f2126bcbe8e60f55494b3d1b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 17:18:55 -0600 Subject: [PATCH 202/243] Minor corrections --- python/TestHarness/runners/HPCRunner.py | 2 +- python/TestHarness/runners/SubprocessRunner.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 69eef3edfbc7..dd162763880a 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -36,7 +36,7 @@ def spawn(self, timer): # it from that file instead and don't bother loading it # into memory hpc_job_output_path = self.run_hpc.getHPCJobOutputPath(self.job) - self.runner_output.setSeparateOutputPath(hpc_job_output_path) + self.getRunOutput().setSeparateOutputPath(hpc_job_output_path) # Rely on 
the RunHPC object to queue the job self.hpc_job = self.run_hpc.queueJob(self.job) diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index e6feb681d17c..bca187eeab21 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -97,8 +97,7 @@ def wait(self, timer): and output[-3] == '\n\0\n': output = output[:-3] - if output: - self.getRunOutput().appendOutput(output) + self.getRunOutput().appendOutput(output) def kill(self): if self.process is not None: From 1702dbdd0dceace1dd3630f0ed0ca890f5b0a231 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 17:21:20 -0600 Subject: [PATCH 203/243] Wrap all entries in double quotes --- python/TestHarness/schedulers/RunHPC.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index b6254ec06d0c..2ea50f306c85 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -411,8 +411,12 @@ def submitJob(self, job, hold): # they are complete on the executing host additional_output = [result_file] for file in tester.getOutputFiles(options): - additional_output.append(f'"{os.path.join(tester.getTestDir(), file)}"') - submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output) + additional_output.append(os.path.join(tester.getTestDir(), file)) + # This is a bash array, need to wrap each entry in double quotes + additional_output_wrapped = [] + for entry in additional_output: + additional_output_wrapped.append(f'"{entry}"') + submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output_wrapped) # Let the derived scheduler add additional variables self.augmentJobSubmission(submission_env) From 7c0eb80efd41769c65741c75fcb7c258355f75b3 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 17:25:08 -0600 Subject: [PATCH 204/243] Pass 
the correct output --- python/TestHarness/schedulers/Job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 0359b2b93eb6..b2bb513194b9 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -470,7 +470,7 @@ def try_catch(do, exception_name, timer_name): self.fileChecker.getNewTimes()) # Allow derived proccessResults to process the output and set a failing status (if it failed) - runner_output = self._runner.getOutput() + runner_output = self._runner.getRunOutput().getOutput() exit_code = self._runner.getExitCode() run_tester = lambda: tester.run(self.options, exit_code, runner_output) try_catch(run_tester, 'TESTER RUN', 'tester_run') From 6c761994068688f1ff67b795879b7063e7a0a68a Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 23 Aug 2024 17:28:31 -0600 Subject: [PATCH 205/243] Correct redirected output --- python/TestHarness/runners/Runner.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index c09abac3e990..d87186826db2 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -8,7 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import os, json -from TestHarness import OutputInterface +from TestHarness import OutputInterface, util class Runner(OutputInterface): """ @@ -68,15 +68,16 @@ def finalize(self): Finalizes the output, which should be called at the end of wait() """ # Load the redirected output files, if any + run_output = self.getRunOutput() for file_path in self.job.getTester().getRedirectedOutputFiles(self.options): - self.appendOutput(util.outputHeader(f'Begin redirected output {file_path}')) + self.run_output.appendOutput(util.outputHeader(f'Begin redirected output {file_path}')) if os.access(file_path, os.R_OK): with open(file_path, 'r+b') as f: - 
self.appendOutput(self.readOutput(f)) + self.run_output.appendOutput(self.readOutput(f)) else: self.job.setStatus(self.job.error, 'FILE TIMEOUT') - self.appendOutput('FILE UNAVAILABLE\n') - self.appendOutput(util.outputHeader(f'End redirected output {file_path}')) + self.appendOutput(f'File {file_path} unavailable') + self.run_output.appendOutput(util.outputHeader(f'End redirected output {file_path}')) # Check for invalid unicode in output output = self.getOutput() From 1026537b3a9ceca04146b0c7f5b7fa2b52c7dd54 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 10:29:59 -0600 Subject: [PATCH 206/243] Add a few tests to run on HPC --- test/tests/kernels/simple_diffusion/tests | 3 +++ .../meshgenerators/distributed_rectilinear/generator/tests | 2 ++ 2 files changed, 5 insertions(+) diff --git a/test/tests/kernels/simple_diffusion/tests b/test/tests/kernels/simple_diffusion/tests index 4a82308a779e..0b9c12b0e446 100644 --- a/test/tests/kernels/simple_diffusion/tests +++ b/test/tests/kernels/simple_diffusion/tests @@ -7,5 +7,8 @@ issues = '#1493' design = 'kernels/Diffusion.md' requirement = 'The system shall run a simple 2D linear diffusion problem with Dirichlet boundary conditions on a regular mesh.' + + # Enables running the limited HPC tests on CIVET on all events + group = 'hpc' [] [] diff --git a/test/tests/meshgenerators/distributed_rectilinear/generator/tests b/test/tests/meshgenerators/distributed_rectilinear/generator/tests index bc80d0e11f78..2b6094159e67 100644 --- a/test/tests/meshgenerators/distributed_rectilinear/generator/tests +++ b/test/tests/meshgenerators/distributed_rectilinear/generator/tests @@ -33,6 +33,8 @@ cli_args = 'Mesh/gmg/dim=3 Mesh/gmg/nx=20 Mesh/gmg/ny=20 Mesh/gmg/nz=20 Outputs/file_base=distributed_rectilinear_mesh_generator_out_3d Outputs/hide="pid npid" ' requirement = 'The system shall be able to generate 3D HEX8 mesh in parallel.' 
valgrind = 'NONE' + # Enables running the limited HPC tests on CIVET on all events + group = 'hpc' [../] [./3D_ptscotch] From e08c55ac89368bdf167d7265d33e2aa211546cec Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 10:30:49 -0600 Subject: [PATCH 207/243] Remove SILENT stipulation --- python/TestHarness/TestHarness.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 70c834c515b7..a9530a01a3d5 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -810,9 +810,6 @@ def writeResults(self): for job in job_group: status, message, message_color, status_code, sort_value = job.getJointStatus() - if status == 'SILENT': - continue - # Create empty key based on TestDir, or re-inialize with existing data so we can append to it self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) From 9203c5be7340ce6cf5182b472c98266755eb3b46 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 11:07:27 -0600 Subject: [PATCH 208/243] Add HPC information and time information to result file --- python/TestHarness/TestHarness.py | 21 +++++++++++++++++---- python/TestHarness/schedulers/RunHPC.py | 7 ++++++- python/TestHarness/schedulers/Scheduler.py | 8 ++++++-- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index a9530a01a3d5..4d929904fd13 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -15,6 +15,8 @@ from . 
import RaceChecker import subprocess import shutil +import socket +import datetime from socket import gethostname from FactorySystem.Factory import Factory @@ -364,7 +366,7 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): def findAndRunTests(self, find_only=False): self.error_code = 0x0 self.preRun() - self.start_time = clock() + self.start_time = datetime.datetime.now() launched_tests = [] if self.options.input_file_name != '': self._infiles = self.options.input_file_name.split(',') @@ -675,7 +677,7 @@ def cleanup(self): for (job, sort_value, timing) in sorted(self.test_table, key=lambda x: x[1]): print((util.formatResult(job, self.options, caveats=True))) - time = clock() - self.start_time + time = datetime.datetime.now() - self.start_time print(('-' * (self.options.term_cols))) @@ -702,10 +704,13 @@ def cleanup(self): else: timing_max = 0 timing_avg = 0 - print(('Ran %d tests in %.1f seconds. Average test time %.1f seconds, maximum test time %.1f seconds.' % (self.num_passed+self.num_failed, time, timing_avg, timing_max))) + summary = f'Ran {self.num_passed + self.num_failed} tests in {time.total_seconds():.1f} seconds.' + summary += f' Average test time {timing_avg:.1f} seconds,' + summary += f' maximum test time {timing_max:.1f} seconds.' 
+ print(summary) # Get additional results from the scheduler - scheduler_summary = self.scheduler.additionalResultSummary() + scheduler_summary = self.scheduler.appendResultFooter() if scheduler_summary: print(scheduler_summary) @@ -805,6 +810,14 @@ def writeResults(self): # Record the Scheduler Plugin used self.options.results_storage['SCHEDULER'] = self.scheduler.__class__.__name__ + # Record information on the host we can ran on + self.options.results_storage['HOSTNAME'] = socket.gethostname() + self.options.results_storage['USER'] = os.getlogin() + self.options.results_storage['TIME'] = self.start_time.strftime('%Y-%m-%d %H:%M:%S') + + # Record any additional data from the cheduler + self.options.results_storage.update(self.scheduler.appendResultFileHeader()) + # Write some useful data to our results_storage for job_group in all_jobs: for job in job_group: diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 2ea50f306c85..e03d3b2100dc 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -855,7 +855,7 @@ def waitFinish(self): functor = lambda hpc_job: hpc_job.state == hpc_job.State.held self.killHPCJobs(functor) - def additionalResultSummary(self): + def appendResultFooter(self): timer_keys = ['hpc_queued', 'hpc_wait_output'] times = {} for key in timer_keys: @@ -874,3 +874,8 @@ def additionalResultSummary(self): result = f'Average queue time {averages["hpc_queued"]:.1f} seconds, ' result += f'average output wait time {averages["hpc_wait_output"]:.1f} seconds.' 
return result + + def appendResultFileHeader(self): + hpc_entry = {'scheduler': self.options.hpc, + 'hosts': self.options.hpc_host if isinstance(self.options.hpc_host, list) else [self.options.hpc_host]} + return {'HPC': hpc_entry} diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 2c5d430adc4f..2044c06c7b8c 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -548,6 +548,10 @@ def runJob(self, job, jobs): except KeyboardInterrupt: self.killRemaining(keyboard=True) - def additionalResultSummary(self): - """ Entrypoint to add additional results to the final summary """ + def appendResultFooter(self) -> str or None: + """ Entrypoint to add additional results to the on screen result footer """ return None + + def appendResultFileHeader(self) -> dict: + """ Entrypoint to add entries to the result file header """ + return {} From 1d60c5d46842f7ecf28dfed9f3199d6a7933c619 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 11:38:23 -0600 Subject: [PATCH 209/243] Add more HPC context to the results file, clean up dirty files --- python/TestHarness/TestHarness.py | 38 ++++++++++++-------- python/TestHarness/schedulers/Job.py | 7 ++++ python/TestHarness/schedulers/RunHPC.py | 19 ++++++++-- python/TestHarness/schedulers/RunParallel.py | 14 ++++---- python/TestHarness/schedulers/Scheduler.py | 4 +++ 5 files changed, 58 insertions(+), 24 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 4d929904fd13..db786bb70471 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -825,29 +825,37 @@ def writeResults(self): # Create empty key based on TestDir, or re-inialize with existing data so we can append to it self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) + test_dir_entry = self.options.results_storage[job.getTestDir()] 
# Output that isn't in a file (no --sep-files) output = job.getCombinedOutput() if not self.options.sep_files else None # Output that is in a file (--sep-files) output_files = job.getCombinedSeparateOutputPaths() if self.options.sep_files else None - self.options.results_storage[job.getTestDir()][job.getTestName()] = {'NAME' : job.getTestNameShort(), - 'LONG_NAME' : job.getTestName(), - 'TIMING' : job.timer.totalTimes(), - 'STATUS' : status, - 'STATUS_MESSAGE' : message, - 'FAIL' : job.isFail(), - 'COLOR' : message_color, - 'CAVEATS' : list(job.getCaveats()), - 'OUTPUT' : output, - 'OUTPUT_FILES' : output_files, - 'TESTER_OUTPUT_FILES' : job.getOutputFiles(self.options), - 'INPUT_FILE' : job.getInputFile(), - 'COMMAND' : job.getCommand(), - 'META_DATA' : job.getMetaData()} + # Initialize entry for this job + test_dir_entry[job.getTestName()] = {} + job_entry = test_dir_entry[job.getTestName()] + job_data = {'NAME' : job.getTestNameShort(), + 'LONG_NAME' : job.getTestName(), + 'TIMING' : job.timer.totalTimes(), + 'STATUS' : status, + 'STATUS_MESSAGE' : message, + 'FAIL' : job.isFail(), + 'COLOR' : message_color, + 'CAVEATS' : list(job.getCaveats()), + 'OUTPUT' : output, + 'OUTPUT_FILES' : output_files, + 'TESTER_OUTPUT_FILES' : job.getOutputFiles(self.options), + 'INPUT_FILE' : job.getInputFile(), + 'COMMAND' : job.getCommand(), + 'META_DATA' : job.getMetaData()} + job_entry.update(job_data) + + # Additional data from the scheduler for this job + job_entry.update(self.scheduler.appendResultFileJob(job)) # Additional data to store (overwrites any previous matching keys) - self.options.results_storage[job.getTestDir()].update(job.getMetaData()) + test_dir_entry.update(job.getMetaData()) if self.options.output_dir: self.options.output_dir = os.path.abspath(self.options.output_dir) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index b2bb513194b9..2e35be6836a8 100644 --- a/python/TestHarness/schedulers/Job.py +++ 
b/python/TestHarness/schedulers/Job.py @@ -335,6 +335,13 @@ def getUniquePrereqs(self): unique_prereqs.append(os.path.join(self.getTestDir(), prereq)) return unique_prereqs + def addDirtyFiles(self, files): + dirty_files = self.getMetaData().get('DIRTY_FILES', []) + for file in files: + if file not in dirty_files: + dirty_files.append(file) + self.addMetaData(DIRTY_FILES=dirty_files) + def addMetaData(self, **kwargs): """ Allow derived methods to store additional data which ends up in the data storage file """ for key, value in kwargs.items(): diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index e03d3b2100dc..4a2598b25b3b 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -431,6 +431,9 @@ def submitJob(self, job, hold): # Write the script open(submission_script, 'w').write(script) + # Add our output to dirty files + job.addDirtyFiles([submission_script, output_file, result_file]) + # Submission command. 
Here we have a simple bash loop # that will try to wait for the file if it doesn't exist yet submission_command = self.getHPCSubmissionCommand() @@ -876,6 +879,16 @@ def appendResultFooter(self): return result def appendResultFileHeader(self): - hpc_entry = {'scheduler': self.options.hpc, - 'hosts': self.options.hpc_host if isinstance(self.options.hpc_host, list) else [self.options.hpc_host]} - return {'HPC': hpc_entry} + entry = {'scheduler': self.options.hpc, + 'pre_source_file': self.options.hpc_pre_source, + 'pre_source': self.source_contents, + 'hosts': self.options.hpc_host if isinstance(self.options.hpc_host, list) else [self.options.hpc_host]} + return {'HPC': entry} + + def appendResultFileJob(self, job): + hpc_job = self.hpc_jobs.get(job.getID()) + if not hpc_job: + return {} + entry = {'id': hpc_job.id, + 'submission_script': self.getHPCJobSubmissionPath(job)} + return {'HPC': entry} diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 66c21a5c5358..9f6df94e6a24 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -66,12 +66,14 @@ def run(self, job): job.appendOutput(util.outputHeader('Python exception encountered in Job') + trace) job.setStatus(job.error, 'JOB EXCEPTION') - if job.hasSeperateOutput(): - dirty_files = [] - for object in job.getOutputObjects().values(): - if object.hasOutput(): - dirty_files.append(object.getSeparateOutputFilePath()) - job.addMetaData(DIRTY_FILES=dirty_files) + # Add the separate output as dirty if we have any + dirty_files = [] + for object in job.getOutputObjects().values(): + if object.hasOutput(): + output_file = object.getSeparateOutputFilePath() + if output_file: + dirty_files.append(output_file) + job.addDirtyFiles(dirty_files) def buildRunner(self, job, options) -> Runner: """Builds the runner for a given tester diff --git a/python/TestHarness/schedulers/Scheduler.py 
b/python/TestHarness/schedulers/Scheduler.py index 2044c06c7b8c..5092e80d9c73 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -555,3 +555,7 @@ def appendResultFooter(self) -> str or None: def appendResultFileHeader(self) -> dict: """ Entrypoint to add entries to the result file header """ return {} + + def appendResultFileJob(self, job) -> dict: + """ Entrypoint to add entries to the result file for a job """ + return {} From e99db0ff4a71d656e355d9cf759f04b7399880ca Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 14:54:15 -0600 Subject: [PATCH 210/243] Unify results storage --- python/TestHarness/TestHarness.py | 275 +++++++++++-------- python/TestHarness/schedulers/Job.py | 83 ++++-- python/TestHarness/schedulers/RunHPC.py | 19 +- python/TestHarness/schedulers/RunParallel.py | 12 +- 4 files changed, 227 insertions(+), 162 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index db786bb70471..70b117179d92 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -244,7 +244,6 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): self.code = b'2d2d6769726c2d6d6f6465' self.error_code = 0x0 self.keyboard_talk = True - self.results_file = '.previous_test_results.json' # Assume libmesh is a peer directory to MOOSE if not defined if "LIBMESH_DIR" in os.environ: self.libmesh_dir = os.environ['LIBMESH_DIR'] @@ -258,6 +257,13 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): # Parse arguments self.parseCLArgs(argv) + # Setup absolute paths and output paths + if self.options.output_dir: + self.options.output_dir = os.path.abspath(self.options.output_dir) + self.options.results_file = os.path.join(self.options.output_dir, self.options.results_file) + else: + self.options.results_file = os.path.abspath(self.options.results_file) + checks = {} checks['platform'] = 
util.getPlatforms() checks['machine'] = util.getMachine() @@ -780,102 +786,26 @@ def cleanup(self): print(str(group[0]).ljust((self.options.term_cols - (len(group[1]) + 4)), ' '), f'[{group[1]}s]') print('\n') - # Perform any write-to-disc operations - self.writeResults() + all_jobs = self.scheduler.retrieveJobs() - def writeResults(self): - """ Don't update the results file when using the --failed-tests argument """ - if self.options.failed_tests or self.options.show_last_run: - return - - """ write test results to disc in some fashion the user has requested """ - all_jobs = self.scheduler.retrieveJobs() - - # Gather and print the jobs with race conditions after the jobs are finished - # and only run when running --diag. - if self.options.pedantic_checks: - checker = RaceChecker.RaceChecker(all_jobs) - if checker.findRacePartners(): - # Print the unique racer conditions and adjust our error code. - self.error_code = checker.printUniqueRacerSets() - else: - print("There are no race conditions.") - - # Record the input file name that was used - self.options.results_storage['INPUT_FILE_NAME'] = self.options.input_file_name - - # Record that we are using --sep-files - self.options.results_storage['SEP_FILES'] = self.options.sep_files - - # Record the Scheduler Plugin used - self.options.results_storage['SCHEDULER'] = self.scheduler.__class__.__name__ - - # Record information on the host we can ran on - self.options.results_storage['HOSTNAME'] = socket.gethostname() - self.options.results_storage['USER'] = os.getlogin() - self.options.results_storage['TIME'] = self.start_time.strftime('%Y-%m-%d %H:%M:%S') - - # Record any additional data from the cheduler - self.options.results_storage.update(self.scheduler.appendResultFileHeader()) - - # Write some useful data to our results_storage - for job_group in all_jobs: - for job in job_group: - status, message, message_color, status_code, sort_value = job.getJointStatus() - - # Create empty key based on TestDir, or 
re-inialize with existing data so we can append to it - self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) - test_dir_entry = self.options.results_storage[job.getTestDir()] - - # Output that isn't in a file (no --sep-files) - output = job.getCombinedOutput() if not self.options.sep_files else None - # Output that is in a file (--sep-files) - output_files = job.getCombinedSeparateOutputPaths() if self.options.sep_files else None - - # Initialize entry for this job - test_dir_entry[job.getTestName()] = {} - job_entry = test_dir_entry[job.getTestName()] - job_data = {'NAME' : job.getTestNameShort(), - 'LONG_NAME' : job.getTestName(), - 'TIMING' : job.timer.totalTimes(), - 'STATUS' : status, - 'STATUS_MESSAGE' : message, - 'FAIL' : job.isFail(), - 'COLOR' : message_color, - 'CAVEATS' : list(job.getCaveats()), - 'OUTPUT' : output, - 'OUTPUT_FILES' : output_files, - 'TESTER_OUTPUT_FILES' : job.getOutputFiles(self.options), - 'INPUT_FILE' : job.getInputFile(), - 'COMMAND' : job.getCommand(), - 'META_DATA' : job.getMetaData()} - job_entry.update(job_data) - - # Additional data from the scheduler for this job - job_entry.update(self.scheduler.appendResultFileJob(job)) - - # Additional data to store (overwrites any previous matching keys) - test_dir_entry.update(job.getMetaData()) - - if self.options.output_dir: - self.options.output_dir = os.path.abspath(self.options.output_dir) - self.options.results_file = os.path.join(self.options.output_dir, self.options.results_file) - - if self.options.results_storage and self.options.results_file: - try: - with open(self.options.results_file, 'w') as data_file: - json.dump(self.options.results_storage, data_file, indent=2) - except UnicodeDecodeError: - print('\nERROR: Unable to write results due to unicode decode/encode error') + # Gather and print the jobs with race conditions after the jobs are finished + # and only run when running --pedantic-checks. 
+ if self.options.pedantic_checks: + checker = RaceChecker.RaceChecker(all_jobs) + if checker.findRacePartners(): + # Print the unique racer conditions and adjust our error code. + self.error_code = checker.printUniqueRacerSets() + else: + print("There are no race conditions.") - # write to a plain file to aid in reproducing error - with open(self.options.results_file + '.unicode_error' , 'w') as f: - f.write(self.options.results_storage) + if not self.useExistingStorage(): + # Store the results from each job + for job_group in all_jobs: + for job in job_group: + self.storeJobResults(job) - sys.exit(1) - except IOError: - print('\nERROR: Unable to write results due to permissions') - sys.exit(1) + # And write the results + self.writeResults(complete=True) try: # Write one file, with verbose information (--file) @@ -909,6 +839,130 @@ def determineScheduler(self): # The default scheduler plugin return 'RunParallel' + def initializeResults(self): + """ Initializes the results storage + + If using existing storage, this will load the previous storage. 
+ + If not using existing storage, this will: + - Delete the previous storage, if any + - Setup the header for the storage + - Write the incomplete storage to file + """ + if self.useExistingStorage(): + if not os.path.exists(self.options.results_file): + print(f'The previous run {self.options.results_file} does not exist') + sys.exit(1) + try: + with open(self.options.results_file, 'r') as f: + self.options.results_storage = json.load(f) + except: + print(f'ERROR: Failed to load result {self.options.results_file}') + raise + + if self.options.results_storage['INCOMPLETE']: + print(f'ERROR: The previous result {self.options.results_file} is incomplete!') + sys.exit(1) + + # Adhere to previous input file syntax, or set the default + _input_file_name = 'tests' + if self.options.input_file_name: + _input_file_name = self.options.input_file_name + self.options.input_file_name = self.options.results_storage.get('INPUT_FILE_NAME', _input_file_name) + + # Done working with existing storage + return + + # Remove the old one if it exists + if os.path.exists(self.options.results_file): + os.remove(self.options.results_file) + + # Not using previous or previous failed, initialize a new one + self.options.results_storage = {} + storage = self.options.results_storage + + # Record the input file name that was used + storage['INPUT_FILE_NAME'] = self.options.input_file_name + + # Record that we are using --sep-files + storage['SEP_FILES'] = self.options.sep_files + + # Record the Scheduler Plugin used + storage['SCHEDULER'] = self.scheduler.__class__.__name__ + + # Record information on the host we can ran on + storage['HOSTNAME'] = socket.gethostname() + storage['USER'] = os.getlogin() + + # Record when the run began + storage['TIME'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + # Record any additional data from the scheduler + storage.update(self.scheduler.appendResultFileHeader()) + + # Record whether or not the storage is incomplete + storage['INCOMPLETE'] = 
True + + # Write the headers + self.writeResults() + + def writeResults(self, complete=False): + """ Forcefully write the current results to file + + Will not do anything if using existing storage. + """ + # Not writing results + if self.useExistingStorage(): + raise Exception('Should not write results') + + # Make it as complete (run is done) + self.options.results_storage['INCOMPLETE'] = not complete + + # Store to a temporary file so that we always have a working file + file = self.options.results_file + file_in_progress = self.options.results_file + '.inprogress' + try: + with open(file_in_progress, 'w') as data_file: + json.dump(self.options.results_storage, data_file, indent=2) + except UnicodeDecodeError: + print(f'\nERROR: Unable to write results {file_in_progress} due to unicode decode/encode error') + + # write to a plain file to aid in reproducing error + with open(file + '.unicode_error' , 'w') as f: + f.write(self.options.results_storage) + + raise + except IOError: + print(f'\nERROR: Unable to write results {file_in_progress} due to permissions') + raise + + # Replace the file now that it's complete + try: + os.replace(file_in_progress, file) + except: + print(f'\nERROR: Failed to move in progress results {file_in_progress} to {file}') + raise + + def storeJobResults(self, job): + """ Stores the results from a job to the results storage """ + # Nothing to store + if self.useExistingStorage(): + raise Exception('Should not store job results') + + # Get the job's result out of the thread lock + job_results = job.getResults() + + # Create empty key based on TestDir, or re-inialize with existing data so we can append to it + self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) + test_dir_entry = self.options.results_storage[job.getTestDir()] + + # Initialize entry for this job + test_dir_entry[job.getTestName()] = job_results + 
test_dir_entry[job.getTestName()].update(self.scheduler.appendResultFileJob(job)) + + # Additional data to store (overwrites any previous matching keys) + test_dir_entry.update(job.getMetaData()) + def initialize(self, argv, app_name): # Load the scheduler plugins plugin_paths = [os.path.join(self.moose_dir, 'python', 'TestHarness'), os.path.join(self.moose_dir, 'share', 'moose', 'python', 'TestHarness')] @@ -926,6 +980,7 @@ def initialize(self, argv, app_name): # Create the scheduler self.scheduler = self.factory.create(scheduler_plugin, self, plugin_params) + # Now that the scheduler is setup, initialize the results storage # Save executable-under-test name to self.executable exec_suffix = 'Windows' if platform.system() == 'Windows' else '' self.executable = app_name + '-' + self.options.method + exec_suffix @@ -958,36 +1013,12 @@ def initialize(self, argv, app_name): if ex.errno == errno.EEXIST: pass else: raise - # Use a previous results file, or declare the variable - self.options.results_storage = {} - if self.useExistingStorage(): - with open(self.options.results_file, 'r') as f: - try: - self.options.results_storage = json.load(f) - - # Adhere to previous input file syntax, or set the default - _input_file_name = 'tests' - if self.options.input_file_name: - _input_file_name = self.options.input_file_name - self.options.input_file_name = self.options.results_storage.get('INPUT_FILE_NAME', _input_file_name) - - except ValueError: - # This is a hidden file, controled by the TestHarness. So we probably shouldn't error - # and exit. Perhaps a warning instead, and create a new file? Down the road, when - # we use this file for PBS etc, this should probably result in an exception. - print(('INFO: Previous %s file is damaged. Creating a new one...' 
% (self.results_storage))) + # Initialize the results storage or load the previous results + self.initializeResults() def useExistingStorage(self): """ reasons for returning bool if we should use a previous results_storage file """ - if (os.path.exists(self.options.results_file) - and (self.options.failed_tests or self.options.show_last_run)): - return True - elif ((self.options.failed_tests or self.options.show_last_run) - and not os.path.exists(self.options.results_file)): - print('A previous run does not exist') - sys.exit(1) - elif os.path.exists(self.options.results_file): - os.remove(self.options.results_file) + return self.options.failed_tests or self.options.show_last_run ## Parse command line options and assign them to self.options def parseCLArgs(self, argv): @@ -1060,7 +1091,7 @@ def parseCLArgs(self, argv): outputgroup.add_argument("--dump", action="store_true", dest="dump", help="Dump the parameters for the testers in GetPot Format") outputgroup.add_argument("--no-trimmed-output", action="store_true", dest="no_trimmed_output", help="Do not trim the output") outputgroup.add_argument("--no-trimmed-output-on-error", action="store_true", dest="no_trimmed_output_on_error", help="Do not trim output for tests which cause an error") - outputgroup.add_argument("--results-file", nargs=1, default=self.results_file, help="Save run_tests results to an alternative json file (default: %(default)s)") + outputgroup.add_argument("--results-file", nargs=1, default='.previous_test_results.json', help="Save run_tests results to an alternative json file (default: %(default)s)") outputgroup.add_argument("--show-last-run", action="store_true", dest="show_last_run", help="Display previous results without executing tests again") # Options for HPC execution diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 2e35be6836a8..b83933ca8657 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -335,7 
+335,8 @@ def getUniquePrereqs(self): unique_prereqs.append(os.path.join(self.getTestDir(), prereq)) return unique_prereqs - def addDirtyFiles(self, files): + def addDirtyFiles(self, files: list): + """ Adds the given files as dirty for this job """ dirty_files = self.getMetaData().get('DIRTY_FILES', []) for file in files: if file not in dirty_files: @@ -552,16 +553,6 @@ def getCombinedOutput(self, concatenate=False): return output - def setPreviousOutputs(self, outputs): - """ Sets outputs from a previous run of this Job """ - for name, object in self.getOutputObjects().items(): - object.setOutput(outputs[name]) - - def setPreviousSeparateOutputs(self, output_paths): - """ Sets --sep-files outputs from a previous run of this Job """ - for name, object in self.getOutputObjects().items(): - object.setSeparateOutputPath(output_paths[name]) - def getRunner(self): """ Gets the Runner that actually runs the command """ return self._runner @@ -604,18 +595,6 @@ def hasSeperateOutput(self): """ return self.options.sep_files - - def setPreviousTimer(self, timer_dict): - """ - Allow arbitrary timer times to be set. This is used by the QueueManager - to set the time as recorded by a previous TestHarness instance. 
- """ - self.timer.reset() - time_now = Timer.time_now() - for name, total_time in timer_dict.items(): - self.timer.start(name, time_now) - self.timer.stop(name, time_now + total_time) - def getTiming(self): """ Return active time if available, if not return a comparison of start and end time """ # Actual execution time @@ -709,3 +688,61 @@ def getJointStatus(self): self.__tester.getStatus().color, self.__tester.getStatus().code, self.__tester.getStatus().sort_value) + + def getResults(self): + """ Gets the results for this job for the results storage """ + status, message, message_color, _, _ = self.getJointStatus() + + # Output that isn't in a file (no --sep-files) + output = self.getCombinedOutput() if not self.hasSeperateOutput() else None + # Output that is in a file (--sep-files) + output_files = self.getCombinedSeparateOutputPaths() if self.hasSeperateOutput() else None + + job_data = {'NAME' : self.getTestNameShort(), + 'LONG_NAME' : self.getTestName(), + 'TIMING' : self.timer.totalTimes(), + 'STATUS' : status, + 'STATUS_MESSAGE' : message, + 'FAIL' : self.isFail(), + 'COLOR' : message_color, + 'CAVEATS' : list(self.getCaveats()), + 'OUTPUT' : output, + 'OUTPUT_FILES' : output_files, + 'TESTER_OUTPUT_FILES' : self.getOutputFiles(self.options), + 'INPUT_FILE' : self.getInputFile(), + 'COMMAND' : self.getCommand(), + 'META_DATA' : self.getMetaData()} + return job_data + + def loadPreviousResults(self): + """ Loads the previous results for this job for the results storage """ + try: + test_results = self.options.results_storage[self.getTestDir()][self.getTestName()] + except KeyError: + print(f'ERROR: {self.getTestName()} is missing in {self.options.results_file}') + sys.exit(1) + + # Set the tester status + tester = self.getTester() + status, message, caveats = self.previousTesterStatus(self.options, self.options.results_storage) + tester.setStatus(status, message) + if caveats: + tester.addCaveats(caveats) + + # Set the previous times + self.timer.reset() + 
time_now = Timer.time_now() + for name, total_time in test_results['TIMING'].items(): + self.timer.start(name, time_now) + self.timer.stop(name, time_now + total_time) + + # Set the previous --sep-files outputs, if any + if self.options.results_storage['SEP_FILES']: + output_paths = test_results['OUTPUT_FILES'] + for name, object in self.getOutputObjects().items(): + object.setSeparateOutputPath(output_paths[name]) + # Otherwise, set the previous actual outputs + else: + outputs = test_results['OUTPUT'] + for name, object in self.getOutputObjects().items(): + object.setOutput(outputs[name]) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 4a2598b25b3b..90e7010e4911 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -141,11 +141,12 @@ def __init__(self, harness, params): except: pass - # Make sure that we can call commands up front - for val in self.CallHPCPoolType: - if self.options.hpc_no_hold and val == self.CallHPCPoolType.queue: - continue - self.callHPC(val, 'hostname') + # Make sure that we can call commands up front, only if we're not re-running + if not self.options.show_last_run: + for val in self.CallHPCPoolType: + if self.options.hpc_no_hold and val == self.CallHPCPoolType.queue: + continue + self.callHPC(val, 'hostname') # Pool for submitJob(), so that we can submit jobs to be # held in the background without blocking @@ -739,10 +740,14 @@ def killHPCJobs(self, functor): if job_ids: self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') + return len(job_ids) + def killRemaining(self, keyboard=False): """Kills all currently running HPC jobs""" functor = lambda hpc_job: hpc_job.state not in [hpc_job.State.killed, hpc_job.State.done] - self.killHPCJobs(functor) + killed_jobs = self.killHPCJobs(functor) + if keyboard and killed_jobs: + print(f'\nAttempted to kill remaining {killed_jobs} HPC jobs...') 
super().killRemaining(keyboard) def getHPCSchedulerName(self): @@ -888,7 +893,7 @@ def appendResultFileHeader(self): def appendResultFileJob(self, job): hpc_job = self.hpc_jobs.get(job.getID()) if not hpc_job: - return {} + return {'HPC': None} entry = {'id': hpc_job.id, 'submission_script': self.getHPCJobSubmissionPath(job)} return {'HPC': entry} diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 9f6df94e6a24..79bedd8feeeb 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -39,17 +39,9 @@ def run(self, job): if self.options.dry_run: self.setSuccessfulMessage(tester) return + # Load results from a previous run elif self.options.show_last_run: - job_results = self.options.results_storage[job.getTestDir()][job.getTestName()] - status, message, caveats = job.previousTesterStatus(self.options, self.options.results_storage) - tester.setStatus(status, message) - if caveats: - tester.addCaveats(caveats) - job.setPreviousTimer(job_results['TIMING']) - if self.options.results_storage['SEP_FILES']: - job.setPreviousSeparateOutputs(job_results['OUTPUT_FILES']) - else: - job.setPreviousOutputs(job_results['OUTPUT']) + job.loadPreviousResults() return # Anything that throws while running or processing a job should be caught From f11434422dd140fa487f93a103e0bb79e4b3b0e7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 14:54:28 -0600 Subject: [PATCH 211/243] Don't submit jobs when we're doing a dry run --- python/TestHarness/schedulers/RunHPC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 90e7010e4911..72ab7867e8e7 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -707,7 +707,7 @@ def augmentJobs(self, jobs): # a chance to get to this in the pool, when it finally # executes in the 
pool, it will do nothing because the # HPCJob will already exist. - if not self.options.hpc_no_hold: + if not self.options.hpc_no_hold and not self.options.dry_run: self.submit_job_pool.apply_async(self.submitJob, (job, True,)) def killHPCJob(self, hpc_job, lock=True): From b0bc5b394ffe28ad8a724465fd5b0e79ebe046bd Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 16:21:50 -0600 Subject: [PATCH 212/243] Move test results into TESTS key, cleanup access --- python/TestHarness/JobDAG.py | 4 +- python/TestHarness/TestHarness.py | 25 ++--------- python/TestHarness/schedulers/Job.py | 63 ++++++++++++++++------------ python/TestHarness/testers/Tester.py | 41 +++++++++++++----- 4 files changed, 73 insertions(+), 60 deletions(-) diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 660ec2baee59..41b31bf6e064 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -151,7 +151,7 @@ def _fix_cornercases(self, prereq_job, job): def _hasDownStreamsWithFailures(self, job): """ Return True if any dependents of job has previous failures """ for d_job in self.__job_dag.all_downstreams(job): - status, message, caveats = d_job.previousTesterStatus(self.options) + status, message, caveats = d_job.previousTesterStatus() if status in d_job.job_status.getFailingStatuses(): return True @@ -160,7 +160,7 @@ def _doPreviouslyFailed(self, job): Set up statuses for jobs contained within the DAG for use with failed-tests option """ tester = job.getTester() - status, message, caveats = job.previousTesterStatus(self.options) + status, message, caveats = job.previousTesterStatus() # This job passed, but one of its dependents has not if status == tester.success and self._hasDownStreamsWithFailures(job): diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 70b117179d92..67650ab09bf4 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -802,7 +802,7 @@ def 
cleanup(self): # Store the results from each job for job_group in all_jobs: for job in job_group: - self.storeJobResults(job) + job.storeResults(self.scheduler) # And write the results self.writeResults(complete=True) @@ -903,6 +903,9 @@ def initializeResults(self): # Record whether or not the storage is incomplete storage['INCOMPLETE'] = True + # Empty storage for the tests + storage['TESTS'] = {} + # Write the headers self.writeResults() @@ -943,26 +946,6 @@ def writeResults(self, complete=False): print(f'\nERROR: Failed to move in progress results {file_in_progress} to {file}') raise - def storeJobResults(self, job): - """ Stores the results from a job to the results storage """ - # Nothing to store - if self.useExistingStorage(): - raise Exception('Should not store job results') - - # Get the job's result out of the thread lock - job_results = job.getResults() - - # Create empty key based on TestDir, or re-inialize with existing data so we can append to it - self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) - test_dir_entry = self.options.results_storage[job.getTestDir()] - - # Initialize entry for this job - test_dir_entry[job.getTestName()] = job_results - test_dir_entry[job.getTestName()].update(self.scheduler.appendResultFileJob(job)) - - # Additional data to store (overwrites any previous matching keys) - test_dir_entry.update(job.getMetaData()) - def initialize(self, argv, app_name): # Load the scheduler plugins plugin_paths = [os.path.join(self.moose_dir, 'python', 'TestHarness'), os.path.join(self.moose_dir, 'share', 'moose', 'python', 'TestHarness')] diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index b83933ca8657..22bc1745cdcb 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -619,8 +619,8 @@ def setStatus(self, status, message=''): def createStatus(self): return self.job_status.createStatus() - def 
previousTesterStatus(self, options, previous_storage=None): - return self.__tester.previousTesterStatus(options, previous_storage) + def previousTesterStatus(self): + return self.__tester.previousTesterStatus(self.options) def getStatusMessage(self): return self.__job_message @@ -689,15 +689,11 @@ def getJointStatus(self): self.__tester.getStatus().code, self.__tester.getStatus().sort_value) - def getResults(self): - """ Gets the results for this job for the results storage """ + def storeResults(self, scheduler): + """ Store the results for this Job into the results storage """ status, message, message_color, _, _ = self.getJointStatus() - # Output that isn't in a file (no --sep-files) - output = self.getCombinedOutput() if not self.hasSeperateOutput() else None - # Output that is in a file (--sep-files) - output_files = self.getCombinedSeparateOutputPaths() if self.hasSeperateOutput() else None - + # Base job data job_data = {'NAME' : self.getTestNameShort(), 'LONG_NAME' : self.getTestName(), 'TIMING' : self.timer.totalTimes(), @@ -706,25 +702,34 @@ def getResults(self): 'FAIL' : self.isFail(), 'COLOR' : message_color, 'CAVEATS' : list(self.getCaveats()), - 'OUTPUT' : output, - 'OUTPUT_FILES' : output_files, 'TESTER_OUTPUT_FILES' : self.getOutputFiles(self.options), 'INPUT_FILE' : self.getInputFile(), 'COMMAND' : self.getCommand(), 'META_DATA' : self.getMetaData()} - return job_data + if self.hasSeperateOutput(): + job_data['OUTPUT_FILES'] = self.getCombinedSeparateOutputPaths() + else: + job_data['OUTPUT'] = self.getCombinedOutput() + + # Extend with data from the scheduler, if any + job_data.update(scheduler.appendResultFileJob(self)) + + # Get the entry we're loading into + test_dir_entry, test_entry = self.getTester().getResultsEntry(self.options, True) + + # Add the job data + test_entry.update(job_data) + # Additional data to store (overwrites any previous matching keys) + test_dir_entry.update(self.getMetaData()) def loadPreviousResults(self): """ Loads 
the previous results for this job for the results storage """ - try: - test_results = self.options.results_storage[self.getTestDir()][self.getTestName()] - except KeyError: - print(f'ERROR: {self.getTestName()} is missing in {self.options.results_file}') - sys.exit(1) + # False here means don't create it + test_dir_entry, test_entry = self.getTester().getResultsEntry(self.options, False) # Set the tester status tester = self.getTester() - status, message, caveats = self.previousTesterStatus(self.options, self.options.results_storage) + status, message, caveats = self.previousTesterStatus() tester.setStatus(status, message) if caveats: tester.addCaveats(caveats) @@ -732,17 +737,21 @@ def loadPreviousResults(self): # Set the previous times self.timer.reset() time_now = Timer.time_now() - for name, total_time in test_results['TIMING'].items(): + for name, total_time in test_entry['TIMING'].items(): self.timer.start(name, time_now) self.timer.stop(name, time_now + total_time) - # Set the previous --sep-files outputs, if any - if self.options.results_storage['SEP_FILES']: - output_paths = test_results['OUTPUT_FILES'] + # Load the output + output_files = test_entry.get('OUTPUT_FILES') + output = test_entry.get('OUTPUT') + # --sep-files + if output_files: for name, object in self.getOutputObjects().items(): - object.setSeparateOutputPath(output_paths[name]) - # Otherwise, set the previous actual outputs - else: - outputs = test_results['OUTPUT'] + object.setSeparateOutputPath(output_files[name]) + # Output stored in the result + elif output: for name, object in self.getOutputObjects().items(): - object.setOutput(outputs[name]) + object.setOutput(output[name]) + # No output?! 
+ else: + raise Exception(f'Test {self.getTestName()} missing output') diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 52935b0da8f3..2d1841a76b20 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -176,18 +176,39 @@ def setStatus(self, status, message=''): def createStatus(self): return self.test_status.createStatus() - # Return a tuple (status, message, caveats) for this tester as found - # in the .previous_test_results.json file (or supplied json object) - def previousTesterStatus(self, options, previous_storage=None): - if not previous_storage: - previous_storage = options.results_storage + def getResultsEntry(self, options, create, graceful=False): + """ Get the entry in the results storage for this tester """ + tests = options.results_storage['TESTS'] + + test_dir = self.getTestDir() + test_dir_entry = tests.get(test_dir) + if not test_dir_entry: + if not create: + if graceful: + return None, None + raise Exception(f'Test folder {test_dir} not in results') + tests[test_dir] = {} + test_dir_entry = tests[test_dir] + + test_name = self.getTestName() + test_name_entry = test_dir_entry.get(test_name) + if not test_name_entry: + if not create: + if graceful: + return test_dir_entry, None + raise Exception(f'Test {test_dir}/{test_name} not in results') + test_dir_entry[test_name] = {} + return test_dir_entry, test_dir_entry.get(test_name) - status_exists = previous_storage.get(self.getTestDir(), {}).get(self.getTestName(), None) + # Return a tuple (status, message, caveats) for this tester as found + # in the previous results + def previousTesterStatus(self, options): + test_dir_entry, test_entry = self.getResultsEntry(options, False, True) status = (self.test_status.createStatus(), '', '') - if status_exists: - status = (self.test_status.createStatus(str(status_exists['STATUS'])), - str(status_exists['STATUS_MESSAGE']), - status_exists['CAVEATS']) + if test_entry: + status = 
(self.test_status.createStatus(str(test_entry['STATUS'])), + str(test_entry['STATUS_MESSAGE']), + test_entry['CAVEATS']) return (status) def getStatusMessage(self): From 7c50ef00410d2a6f11186375bf5300dbc6bbdcbd Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 16:24:45 -0600 Subject: [PATCH 213/243] Add more runtime information to results file --- python/TestHarness/TestHarness.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 67650ab09bf4..18f2f08c4286 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -893,6 +893,9 @@ def initializeResults(self): # Record information on the host we can ran on storage['HOSTNAME'] = socket.gethostname() storage['USER'] = os.getlogin() + storage['TESTHARNESS_PATH'] = os.path.abspath(os.path.join(os.path.abspath(__file__), '..')) + storage['TESTHARNESS_ARGS'] = sys.argv[1:] + storage['MOOSE_DIR'] = self.moose_dir # Record when the run began storage['TIME'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') From f6e41e1b919964d55852307fa216adc557aa5fd2 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 19:44:58 -0600 Subject: [PATCH 214/243] Use getpass for username instead --- python/TestHarness/TestHarness.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 18f2f08c4286..8daf696394b4 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -17,6 +17,7 @@ import shutil import socket import datetime +import getpass from socket import gethostname from FactorySystem.Factory import Factory @@ -892,7 +893,7 @@ def initializeResults(self): # Record information on the host we can ran on storage['HOSTNAME'] = socket.gethostname() - storage['USER'] = os.getlogin() + storage['USER'] = getpass.getuser() storage['TESTHARNESS_PATH'] = 
os.path.abspath(os.path.join(os.path.abspath(__file__), '..')) storage['TESTHARNESS_ARGS'] = sys.argv[1:] storage['MOOSE_DIR'] = self.moose_dir From 1ba03626573871cefaa301fee491fbb341af92f7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 20:33:56 -0600 Subject: [PATCH 215/243] Clean up output printing, only trim runner output, only hide runner output for --sep-files --- python/TestHarness/TestHarness.py | 34 ++---- python/TestHarness/schedulers/Job.py | 149 ++++++++++++++++++--------- python/TestHarness/util.py | 40 +++---- 3 files changed, 125 insertions(+), 98 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 8daf696394b4..9dfdf84b776e 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -610,26 +610,6 @@ def checkExpectError(self, output, expect_error): else: return True - def printOutput(self, job, color): - """ Method to print a testers output to the screen """ - output = '' - # Print what ever status the tester has at the time - if self.options.verbose or (job.isFail() and not self.options.quiet) or job.isError(): - if job.getCommandRan(): - command = job.getCommandRan() - else: - command = job.getCommand() - output = 'Working Directory: ' + job.getTestDir() + '\nRunning command: ' + command + '\n' - output += util.trimOutput(job, self.options) - output = output.replace('\r', '\n') # replace the carriage returns with newlines - lines = output.split('\n') - - if output != '': - test_name = util.colorText(job.getTestName() + ": ", color, colored=self.options.colored, code=self.options.code) - output = test_name + ("\n" + test_name).join(lines) - print(output) - return output - def handleJobStatus(self, job, caveats=None): """ The Scheduler is calling back the TestHarness to inform us of a status change. 
@@ -640,11 +620,13 @@ def handleJobStatus(self, job, caveats=None): elif not job.isSilent(): # Print results and perform any desired post job processing if job.isFinished(): - status, message, color, status_code, sort_value = job.getJointStatus() - self.error_code = self.error_code | status_code + joint_status = job.getJointStatus() + self.error_code = self.error_code | joint_status.status_code # perform printing of application output if so desired - self.printOutput(job, color) + output = job.getOutputForScreen() + if output: + print(output) # Print status with caveats (if caveats not overridden) caveats = True if caveats is None else caveats @@ -653,7 +635,7 @@ def handleJobStatus(self, job, caveats=None): timing = job.getTiming() # Save these results for 'Final Test Result' summary - self.test_table.append( (job, sort_value, timing) ) + self.test_table.append( (job, joint_status.sort_value, timing) ) self.postRun(job.specs, timing) if job.isSkip(): @@ -1178,10 +1160,6 @@ def checkAndUpdateCLArgs(self): if opts.libmesh_dir: self.libmesh_dir = opts.libmesh_dir - # User wants to write all output, so unify the options involved - if opts.sep_files: - opts.quiet = True - def postRun(self, specs, timing): return diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 22bc1745cdcb..b651c7f22f76 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import itertools, re, os, time, threading +import itertools, re, os, time, threading, traceback from timeit import default_timer as clock from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker @@ -15,7 +15,8 @@ from TestHarness import OutputInterface, util from tempfile import TemporaryDirectory from collections import namedtuple -import traceback + +from TestHarness import util def 
time_now(): return time.time_ns() / (10 ** 9) @@ -138,6 +139,9 @@ class Job(OutputInterface): # Thread lock for creating output directories mkdir_lock = threading.Lock() + # Tuple for getJointStatus() + JointStatus = namedtuple('JointStatus', ['status', 'message', 'color', 'status_code', 'sort_value']) + def __init__(self, tester, job_dag, options): OutputInterface.__init__(self) @@ -520,36 +524,90 @@ def getCombinedSeparateOutputPaths(self): paths[name] = object.getSeparateOutputFilePath() if object.hasOutput() else None return paths - def getCombinedOutput(self, concatenate=False): - """ Return individual output from each object """ - output = '' if concatenate else {} + def getAllOutput(self) -> dict: + """ Get all output in a dict from each object to the text output """ + output = {} + for name, object in self.getOutputObjects().items(): + output[name] = object.getOutput() + object_output = object.getOutput() + + def getOutputForScreen(self): + """ Gets the output for printing on screen """ + show_output = self.options.verbose or (self.isFail() and not self.options.quiet) or self.isError() + if not show_output: + return None + + if self.getCommandRan(): + command = self.getCommandRan() + else: + command = self.getCommand() + + output = 'Working Directory: ' + self.getTestDir() + '\nRunning command: ' + command + '\n' + + # Whether or not to skip the runner_run output, which is the output from the + # actual run (the process that the harness runs) + skip_runner_run = None + if self.options.sep_files and not self.options.verbose: + skip_runner_run = '--sep-files' + + options = self.options + specs = self.specs + for name, object in self.getOutputObjects().items(): object_output = object.getOutput() - if object_output: - wrapped_object_output = '' - - if concatenate: - # Add a complete line break between objects - if output: - wrapped_object_output += '\n' - # Add a header before the output starts - wrapped_object_output += util.outputHeader(f'Begin {name} 
output', ending=False) + '\n' - - # Add the actual output - wrapped_object_output += object_output - - if concatenate: - # Add a newline if one is missing - if wrapped_object_output[-1] != '\n': - wrapped_object_output += '\n' - # Add a footer after the output ends - wrapped_object_output += '\n' + util.outputHeader(f'End {name} output', ending=False) - - output += wrapped_object_output - else: - output[name] = wrapped_object_output - elif not concatenate: - output[name] = None + + # Nothing to output + if not object_output: + continue + + # Max size of this output for trimming + # Currently only used for the runner_run output + max_size = None + + # Possibly trim or skip the runner_run output (actual process output) + if name == 'runner_run': + # Don't output the runner run + if skip_runner_run: + output += f'\nSkipping runner_run output due to {skip_runner_run}\n' + continue + + # Default trimmed output size + max_size = 1000 + # max_buffer_size is set + if specs.isValid('max_buffer_size'): + # ...to the max + if specs['max_buffer_size'] == -1: + max_size = None + # ... 
or to a value + else: + max_size = int(specs['max_buffer_size']) + # Disable trimmed output + if options.no_trimmed_output: + max_size = None + # Don't trim output on error, and we errored + if options.no_trimmed_output_on_error and self.isFail(): + max_size = None + + # Add a complete line break between objects + if output: + output += '\n' + # Add a header before the output starts + output += util.outputHeader(f'Begin {name} output', ending=False) + '\n' + # Add the output, trimming if needed + output += util.trimOutput(object_output, max_size=max_size) + # Add a newline if one is missing + if output[-1] != '\n': + output += '\n' + # Add a footer after the output ends + output += '\n' + util.outputHeader(f'End {name} output', ending=False) + + # Add the text name prefix + if output: + lines = output.split('\n') + joint_status = self.getJointStatus() + prefix = util.colorText(self.getTestName() + ': ', joint_status.color, + colored=self.options.colored, code=self.options.code) + output = prefix + ('\n' + prefix).join(lines) return output @@ -675,32 +733,31 @@ def getJointStatus(self): """ # Job has failed, or tester has no status if self.isError() or self.isNoStatus(): - return (self.getStatus().status, - self.getStatusMessage(), - self.getStatus().color, - self.getStatus().code, - self.getStatus().sort_value) + return Job.JointStatus(status=self.getStatus().status, + message=self.getStatusMessage(), + color=self.getStatus().color, + status_code=self.getStatus().code, + sort_value=self.getStatus().sort_value) # Tester has a finished status of some sort - else: - return (self.__tester.getStatus().status, - self.__tester.getStatusMessage(), - self.__tester.getStatus().color, - self.__tester.getStatus().code, - self.__tester.getStatus().sort_value) + return Job.JointStatus(status=self.__tester.getStatus().status, + message=self.__tester.getStatusMessage(), + color=self.__tester.getStatus().color, + status_code=self.__tester.getStatus().code, + 
sort_value=self.__tester.getStatus().sort_value) def storeResults(self, scheduler): """ Store the results for this Job into the results storage """ - status, message, message_color, _, _ = self.getJointStatus() + joint_status = self.getJointStatus() # Base job data job_data = {'NAME' : self.getTestNameShort(), 'LONG_NAME' : self.getTestName(), 'TIMING' : self.timer.totalTimes(), - 'STATUS' : status, - 'STATUS_MESSAGE' : message, + 'STATUS' : joint_status.status, + 'STATUS_MESSAGE' : joint_status.message, 'FAIL' : self.isFail(), - 'COLOR' : message_color, + 'COLOR' : joint_status.color, 'CAVEATS' : list(self.getCaveats()), 'TESTER_OUTPUT_FILES' : self.getOutputFiles(self.options), 'INPUT_FILE' : self.getInputFile(), @@ -709,7 +766,7 @@ def storeResults(self, scheduler): if self.hasSeperateOutput(): job_data['OUTPUT_FILES'] = self.getCombinedSeparateOutputPaths() else: - job_data['OUTPUT'] = self.getCombinedOutput() + job_data['OUTPUT'] = self.getAllOutput() # Extend with data from the scheduler, if any job_data.update(scheduler.appendResultFileJob(self)) diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index 5e8201430845..a00ad0e6a8d4 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -260,7 +260,7 @@ def formatStatusMessage(job, status, message, options): def formatResult(job, options, result='', color=True, **kwargs): # Support only one instance of a format identifier, but obey the order terminal_format = list(OrderedDict.fromkeys(list(options.term_format))) - status, message, message_color, exit_code, sort_value = job.getJointStatus() + joint_status = job.getJointStatus() color_opts = {'code' : options.code, 'colored' : options.colored} @@ -282,18 +282,18 @@ def formatResult(job, options, result='', color=True, **kwargs): justification_index = terminal_format[i] if str(f_key).lower() == 'p': - pre_result = ' '*(8-len(status)) + status - formatCase(f_key, (pre_result, message_color), formatted_results) + pre_result = ' 
'*(8-len(joint_status.status)) + joint_status.status + formatCase(f_key, (pre_result, joint_status.color), formatted_results) if str(f_key).lower() == 's': if not result: - result = formatStatusMessage(job, status, message, options) + result = formatStatusMessage(job, joint_status.status, joint_status.message, options) # refrain from printing a duplicate pre_result if it will match result - if 'p' in [x.lower() for x in terminal_format] and result == status: + if 'p' in [x.lower() for x in terminal_format] and result == joint_status.status: formatCase(f_key, None, formatted_results) else: - formatCase(f_key, (result, message_color), formatted_results) + formatCase(f_key, (result, joint_status.color), formatted_results) if str(f_key).lower() == 'n': formatCase(f_key, (job.getTestName(), None), formatted_results) @@ -310,7 +310,7 @@ def formatResult(job, options, result='', color=True, **kwargs): # Decorate Caveats if job.getCaveats() and caveat_index is not None and 'caveats' in kwargs and kwargs['caveats']: caveats = ','.join(job.getCaveats()) - caveat_color = message_color + caveat_color = joint_status.color if not job.isFail(): caveat_color = 'CYAN' @@ -869,27 +869,19 @@ def deleteFilesAndFolders(test_dir, paths, delete_folders=True): # TL;DR; Just pass... 
pass -# Trimming routines for job output -def trimOutput(job, options): - output = job.getCombinedOutput(concatenate=True) - if ((job.isFail() and options.no_trimmed_output_on_error) - or (job.specs.isValid('max_buffer_size') and job.specs['max_buffer_size'] == -1) - or options.no_trimmed_output): - return output - elif job.specs.isValid('max_buffer_size'): - max_size = int(job.specs['max_buffer_size']) - else: - max_size = 100000 - - if len(output) <= max_size: +def trimOutput(output, max_size=None): + """ Trims the output given some max size """ + if not max_size or len(output) < max_size or not output: return output first_part = int(max_size*(2.0/3.0)) second_part = int(max_size*(1.0/3.0)) - return "%s\n%s\n\nOutput trimmed\n\n%s\n%s" % (output[:first_part], - "#"*80, - "#"*80, - output[-second_part:]) + trimmed = f'{output[:first_part]}' + if trimmed[-1] != '\n': + trimmed += '\n' + sep = "#" * 80 + trimmed += f'\n{sep}\nOutput trimmed\n{sep}\n{output[-second_part:]}' + return trimmed def outputHeader(header, ending=True): """ From 04666c58ca9cbb5a682f92135d0c8c6f9bfd86e9 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 20:34:07 -0600 Subject: [PATCH 216/243] Simplify previous result load --- python/TestHarness/schedulers/Job.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index b651c7f22f76..9d2fdc72806f 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -801,14 +801,10 @@ def loadPreviousResults(self): # Load the output output_files = test_entry.get('OUTPUT_FILES') output = test_entry.get('OUTPUT') - # --sep-files - if output_files: - for name, object in self.getOutputObjects().items(): + for name, object in self.getOutputObjects().items(): + if output_files: # --sep-files object.setSeparateOutputPath(output_files[name]) - # Output stored in the result - elif output: - for name, object 
in self.getOutputObjects().items(): + elif output: # stored in result object.setOutput(output[name]) - # No output?! - else: - raise Exception(f'Test {self.getTestName()} missing output') + else: + raise Exception(f'Test {self.getTestName()} missing output') From 6780b1feace220fbe8a606f0dfd8f477fb37462b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 21:45:21 -0600 Subject: [PATCH 217/243] Remove unused method --- python/TestHarness/runners/HPCRunner.py | 60 ------------------------- 1 file changed, 60 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index dd162763880a..72111af3be01 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -258,63 +258,3 @@ def getLastLine(file): pos = f.tell() line = f.readline().decode('utf-8') return line, pos - - @staticmethod - def readTruncated(file, start_lines=500, end_lines=500): - """ - Reads a file and truncates it past a certain amount of lines. 
- """ - with open(file, 'rb') as f: - # Find the end position of the file so that we don't read past - f.seek(0, os.SEEK_END) - total_bytes = f.tell() - - # Read the set of lines - f.seek(0) - head_lines_read = 0 - head = '' - while head_lines_read < start_lines and f.tell() < total_bytes: - head += f.read(1).decode('utf-8') - if len(head) > 1 and head[-1:] == '\n': - head_lines_read += 1 - - # Keep the end of the head position so that we don't read - # backwards past it for the tail - head_pos = f.tell() - - # Seek to the end and start reading ending lines - f.seek(0, os.SEEK_END) - - # Keep reading the ending lines until we've reached the max - # number of lines we want or have reached the head output - tail_lines_read = 0 - tail = [] - while tail_lines_read < end_lines and f.tell() > head_pos: - # Read each character in the line until we reach - # the beginning or a new line - line = [] - while f.tell() > 1: - f.seek(-2, os.SEEK_CUR) - char = f.read(1).decode('utf-8') - if char == '\n' or f.tell() == 0: - break - line.append(char) - - # Append the new read line - line.reverse() - tail.append(''.join(line)) - tail_lines_read += 1 - - # Whether or not we have truncated output - # (have hit the location of the head output) - truncated = f.tell() != head_pos - - # Form the combined output - output = head - if truncated: - output += util.outputHeader('OUTPUT TRIMMED') - if tail: - tail.reverse() - output += '\n'.join(tail) - - return output From ad18f9e463d415854ed83e778fa6e1cb241f3326 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 21:47:45 -0600 Subject: [PATCH 218/243] Remove pbs test --- python/TestHarness/tests/test_PBS.py | 56 ---------------------------- python/TestHarness/tests/tests | 6 --- 2 files changed, 62 deletions(-) delete mode 100644 python/TestHarness/tests/test_PBS.py diff --git a/python/TestHarness/tests/test_PBS.py b/python/TestHarness/tests/test_PBS.py deleted file mode 100644 index 9710fc63aa9a..000000000000 --- 
a/python/TestHarness/tests/test_PBS.py +++ /dev/null @@ -1,56 +0,0 @@ -#* This file is part of the MOOSE framework -#* https://www.mooseframework.org -#* -#* All rights reserved, see COPYRIGHT for full restrictions -#* https://github.com/idaholab/moose/blob/master/COPYRIGHT -#* -#* Licensed under LGPL 2.1, please see LICENSE for details -#* https://www.gnu.org/licenses/lgpl-2.1.html - -import subprocess, unittest, os -from TestHarnessTestCase import TestHarnessTestCase - -def checkQstat(): - try: - if subprocess.call(['qstat']) == 0: - return True - except: - pass - -@unittest.skipIf(checkQstat() != True, "PBS not available") -class TestHarnessTester(TestHarnessTestCase): - """ - Test general PBS functionality. There are some caveats however: - - We cannot test the output of specific test. Only the initial launch return code. This - is because launching qsub is a background process, and we have no idea when that job - is finished. Or if it even began (perhaps the job is queued). - """ - def setUp(self): - """ - setUp occurs before every test. 
Clean up previous results file - """ - pbs_results_file = os.path.join(os.getenv('MOOSE_DIR'), 'test', '_testPBS') - - # File will not exist on the first run - try: - os.remove(pbs_results_file) - except: - pass - - def testPBSQueue(self): - """ - Test argument '--pbs-queue does-not-exist' fails, as this queue should not exist - """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--pbs-queue', 'does-not-exist', '--pbs', '_testPBS', '-i', 'always_ok') - - e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'ERROR: qsub: Unknown queue') - - def testPBSLaunch(self): - """ - Test general launch command - """ - output = self.runTests('--pbs', '_testPBS', '-i', 'always_ok').decode('utf-8') - self.assertNotIn('LAUNCHED', output) diff --git a/python/TestHarness/tests/tests b/python/TestHarness/tests/tests index 68689f62b1b0..69226d40ecf5 100644 --- a/python/TestHarness/tests/tests +++ b/python/TestHarness/tests/tests @@ -183,12 +183,6 @@ requirement = "The system shall be able to perform recovery of a test" issues = '#11492' [] - [pbs_tests] - type = PythonUnitTest - input = test_PBS.py - requirement = "The system shall be able to submit jobs to a PBS third party scheduler" - issues = '#12138' - [] [trim_output] type = PythonUnitTest input = test_TrimOutput.py From 9ecaaae1a2c8d66f78e8b32fd1624eee303e2eb9 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 21:48:55 -0600 Subject: [PATCH 219/243] Revert to previous size --- python/TestHarness/schedulers/Job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 9d2fdc72806f..e7fdc852361b 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -572,7 +572,7 @@ def getOutputForScreen(self): continue # Default trimmed output size - max_size = 1000 + max_size = 100000 # max_buffer_size is set if specs.isValid('max_buffer_size'): # ...to 
the max From 782d396afb09469c03676e3ddbac0e181b5a604d Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 22:04:30 -0600 Subject: [PATCH 220/243] Correct timing --- python/TestHarness/TestHarness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 9dfdf84b776e..28ce3f2fd63a 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -666,7 +666,7 @@ def cleanup(self): for (job, sort_value, timing) in sorted(self.test_table, key=lambda x: x[1]): print((util.formatResult(job, self.options, caveats=True))) - time = datetime.datetime.now() - self.start_time + time = (datetime.datetime.now() - self.start_time).total_seconds() print(('-' * (self.options.term_cols))) @@ -693,7 +693,7 @@ def cleanup(self): else: timing_max = 0 timing_avg = 0 - summary = f'Ran {self.num_passed + self.num_failed} tests in {time.total_seconds():.1f} seconds.' + summary = f'Ran {self.num_passed + self.num_failed} tests in {time:.1f} seconds.' summary += f' Average test time {timing_avg:.1f} seconds,' summary += f' maximum test time {timing_max:.1f} seconds.' 
print(summary) From 2a56aa0e1e75fc052f7e575c5b1d25b5eaf4e1a9 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 22:15:04 -0600 Subject: [PATCH 221/243] Add missing return --- python/TestHarness/schedulers/Job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index e7fdc852361b..1ed997e7645e 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -529,7 +529,7 @@ def getAllOutput(self) -> dict: output = {} for name, object in self.getOutputObjects().items(): output[name] = object.getOutput() - object_output = object.getOutput() + return output def getOutputForScreen(self): """ Gets the output for printing on screen """ From 7c0f126adcbccd21a2a4fe73ae9bf07d7a88cfb1 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 23:26:42 -0600 Subject: [PATCH 222/243] Remove meta data and dirty files; they were needed for the queue manager --- python/TestHarness/schedulers/Job.py | 23 +------------------- python/TestHarness/schedulers/RunHPC.py | 3 --- python/TestHarness/schedulers/RunParallel.py | 9 -------- 3 files changed, 1 insertion(+), 34 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 1ed997e7645e..db3e7e9d0928 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -155,7 +155,6 @@ def __init__(self, tester, job_dag, options): self.__joined_out = '' self.report_timer = None self.__slots = None - self.__meta_data = {} # Create a fileChecker object to be able to call filecheck methods self.fileChecker = FileChecker(self.options.input_file_name) @@ -339,23 +338,6 @@ def getUniquePrereqs(self): unique_prereqs.append(os.path.join(self.getTestDir(), prereq)) return unique_prereqs - def addDirtyFiles(self, files: list): - """ Adds the given files as dirty for this job """ - dirty_files = 
self.getMetaData().get('DIRTY_FILES', []) - for file in files: - if file not in dirty_files: - dirty_files.append(file) - self.addMetaData(DIRTY_FILES=dirty_files) - - def addMetaData(self, **kwargs): - """ Allow derived methods to store additional data which ends up in the data storage file """ - for key, value in kwargs.items(): - self.__meta_data[key] = value - - def getMetaData(self): - """ return data stored by addMetaData """ - return self.__meta_data - def getSlots(self): """ Return the number of slots this job consumes """ if self.__slots == None: @@ -761,8 +743,7 @@ def storeResults(self, scheduler): 'CAVEATS' : list(self.getCaveats()), 'TESTER_OUTPUT_FILES' : self.getOutputFiles(self.options), 'INPUT_FILE' : self.getInputFile(), - 'COMMAND' : self.getCommand(), - 'META_DATA' : self.getMetaData()} + 'COMMAND' : self.getCommand()} if self.hasSeperateOutput(): job_data['OUTPUT_FILES'] = self.getCombinedSeparateOutputPaths() else: @@ -776,8 +757,6 @@ def storeResults(self, scheduler): # Add the job data test_entry.update(job_data) - # Additional data to store (overwrites any previous matching keys) - test_dir_entry.update(self.getMetaData()) def loadPreviousResults(self): """ Loads the previous results for this job for the results storage """ diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 72ab7867e8e7..db03635446c0 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -432,9 +432,6 @@ def submitJob(self, job, hold): # Write the script open(submission_script, 'w').write(script) - # Add our output to dirty files - job.addDirtyFiles([submission_script, output_file, result_file]) - # Submission command. 
Here we have a simple bash loop # that will try to wait for the file if it doesn't exist yet submission_command = self.getHPCSubmissionCommand() diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 79bedd8feeeb..f72fc6051d21 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -58,15 +58,6 @@ def run(self, job): job.appendOutput(util.outputHeader('Python exception encountered in Job') + trace) job.setStatus(job.error, 'JOB EXCEPTION') - # Add the separate output as dirty if we have any - dirty_files = [] - for object in job.getOutputObjects().values(): - if object.hasOutput(): - output_file = object.getSeparateOutputFilePath() - if output_file: - dirty_files.append(output_file) - job.addDirtyFiles(dirty_files) - def buildRunner(self, job, options) -> Runner: """Builds the runner for a given tester From bc14d787d0543e80ef78cb9c761ee7d899e3199f Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 23:25:01 -0600 Subject: [PATCH 223/243] Fix and cleanup test harness unit tests --- .../TestHarness/tests/TestHarnessTestCase.py | 20 +++-- python/TestHarness/tests/test_Allocations.py | 20 ++--- .../tests/test_ArbitrarySpecFile.py | 6 +- python/TestHarness/tests/test_CSVDiffs.py | 10 +-- .../tests/test_CSVValidationTester.py | 8 +- python/TestHarness/tests/test_CustomEval.py | 4 +- python/TestHarness/tests/test_Cyclic.py | 4 +- python/TestHarness/tests/test_Deleted.py | 4 +- .../TestHarness/tests/test_DependencySkip.py | 2 +- python/TestHarness/tests/test_Diff.py | 8 +- python/TestHarness/tests/test_DiffGold.py | 8 +- .../TestHarness/tests/test_DisplayRequired.py | 2 +- .../TestHarness/tests/test_DistributedMesh.py | 4 +- python/TestHarness/tests/test_DoLast.py | 10 +-- python/TestHarness/tests/test_DryRun.py | 10 +-- python/TestHarness/tests/test_Duplicate.py | 10 +-- python/TestHarness/tests/test_Expect.py | 14 +-- 
python/TestHarness/tests/test_ExtraInfo.py | 2 +- python/TestHarness/tests/test_FailedTests.py | 28 +++--- python/TestHarness/tests/test_Ignore.py | 42 ++++----- python/TestHarness/tests/test_LongRunning.py | 2 +- python/TestHarness/tests/test_LongestJobs.py | 4 +- python/TestHarness/tests/test_MinADSize.py | 2 +- python/TestHarness/tests/test_MissingGold.py | 4 +- python/TestHarness/tests/test_ParserErrors.py | 2 +- .../TestHarness/tests/test_PythonVersion.py | 2 +- .../TestHarness/tests/test_RaceConditions.py | 2 +- python/TestHarness/tests/test_Recover.py | 6 +- python/TestHarness/tests/test_Replay.py | 51 ++++++----- .../TestHarness/tests/test_ReportSkipped.py | 4 +- python/TestHarness/tests/test_RequiredApps.py | 6 +- .../TestHarness/tests/test_RequiredObjects.py | 6 +- python/TestHarness/tests/test_SchemaDiff.py | 8 +- .../TestHarness/tests/test_ShouldExecute.py | 6 +- .../tests/test_SoftHeavyDependency.py | 64 +++++++------- python/TestHarness/tests/test_Syntax.py | 10 +-- python/TestHarness/tests/test_Timeout.py | 4 +- python/TestHarness/tests/test_TrimOutput.py | 6 +- python/TestHarness/tests/test_UnknownParam.py | 2 +- .../TestHarness/tests/test_UnknownPrereq.py | 2 +- .../tests/test_UnreadableOutput.py | 2 +- .../tests/test_WorkingDirectory.py | 14 +-- python/TestHarness/tests/test_WriteResults.py | 86 +++++++++---------- 43 files changed, 265 insertions(+), 246 deletions(-) diff --git a/python/TestHarness/tests/TestHarnessTestCase.py b/python/TestHarness/tests/TestHarnessTestCase.py index 4fbfac2679b5..456a60264be1 100644 --- a/python/TestHarness/tests/TestHarnessTestCase.py +++ b/python/TestHarness/tests/TestHarnessTestCase.py @@ -10,6 +10,7 @@ import os import unittest import subprocess +import tempfile import re class TestHarnessTestCase(unittest.TestCase): @@ -17,17 +18,22 @@ class TestHarnessTestCase(unittest.TestCase): TestCase class for running TestHarness commands. 
""" - def runExceptionTests(self, *args): + def runTests(self, *args, tmp_output=True): cmd = ['./run_tests'] + list(args) + ['--term-format', 'njCst'] + sp_kwargs = {'cwd': os.path.join(os.getenv('MOOSE_DIR'), 'test'), + 'text': True} + if tmp_output: + with tempfile.TemporaryDirectory() as output_dir: + cmd += ['-o', output_dir] + return subprocess.check_output(cmd, **sp_kwargs) + return subprocess.check_output(cmd, **sp_kwargs) + + def runExceptionTests(self, *args, tmp_output=True): try: - return subprocess.check_output(cmd, cwd=os.path.join(os.getenv('MOOSE_DIR'), 'test')) - raise RuntimeError('test failed to fail') + self.runTests(*args, tmp_output=tmp_output) except Exception as err: return err.output - - def runTests(self, *args): - cmd = ['./run_tests'] + list(args) + ['--term-format', 'njCst'] - return subprocess.check_output(cmd, cwd=os.path.join(os.getenv('MOOSE_DIR'), 'test')) + raise RuntimeError('test failed to fail') def checkStatus(self, output, passed=0, skipped=0, pending=0, failed=0): """ diff --git a/python/TestHarness/tests/test_Allocations.py b/python/TestHarness/tests/test_Allocations.py index 8cfd19d2a664..6b950b3e3113 100644 --- a/python/TestHarness/tests/test_Allocations.py +++ b/python/TestHarness/tests/test_Allocations.py @@ -17,15 +17,15 @@ def testSkippedAllocations(self): """ # Subject a normally passing test to impossible cpu allocations output = self.runTests('--no-color', '-i', 'always_ok', '-p', '2', '-j', '1') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') + self.assertRegex(output, 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') # Subject a normally passing test to impossible thread allocations output = self.runTests('--no-color', '-i', 'always_ok', '--n-threads', '2', '-j', '1') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') + self.assertRegex(output, 'tests/test_harness.always_ok.*? 
\[INSUFFICIENT SLOTS\] SKIP') # A combination of threads*cpus with too low a hard limit (3*3= -j9) output = self.runTests('--no-color', '-i', 'allocation_test', '--n-threads', '3', '-p', '3', '-j', '8') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.allocation_test.*? \[INSUFFICIENT SLOTS\] SKIP') + self.assertRegex(output, 'tests/test_harness.allocation_test.*? \[INSUFFICIENT SLOTS\] SKIP') def testOversizedCaveat(self): """ @@ -33,13 +33,13 @@ def testOversizedCaveat(self): """ # A test which has no min/max cpu parameters should print oversized # when subjected to -p 2 - output = self.runTests('-i', 'always_ok', '-p', '2').decode('utf-8') + output = self.runTests('-i', 'always_ok', '-p', '2') self.assertNotIn('CPUS', output) self.assertIn('OVERSIZED', output) # A test which has no min/max thread parameters should print oversized # when subjected to --n-threads 2 - output = self.runTests('-i', 'always_ok', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'always_ok', '--n-threads', '2') self.assertNotIn('THREADS', output) self.assertIn('OVERSIZED', output) @@ -51,14 +51,14 @@ def testCpuCaveats(self): """ # Test MIN CPUs / Oversized caveat using soft limit (no -j) on a test # having a minimum cpu parameter of 2. - output = self.runTests('-i', 'allocation_test', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '--n-threads', '2') self.assertNotIn('MIN_THREADS', output) self.assertIn('MIN_CPUS=2', output) self.assertIn('OVERSIZED', output) # Test MAX CPUs / Oversized caveat on a test having a maximum cpu # parameter of 3 (and we subjected it to 4). 
- output = self.runTests('-i', 'allocation_test', '-p', '4', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-p', '4', '--n-threads', '2') self.assertNotIn('MIN_THREADS', output) self.assertIn('MAX_CPUS=3', output) self.assertIn('OVERSIZED', output) @@ -73,7 +73,7 @@ def testThreadCaveats(self): # Note: 1*2 should be -j 2 but the test minimum is 2 threads, so we need # to use -j 4 to suppress any cpu caveats. Oversized will not trigger as # -j4 satisfies this test's requirements. - output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '1').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '1') self.assertNotIn('CPUS', output) self.assertNotIn('OVERSIZED', output) self.assertIn('MIN_THREADS=2', output) @@ -83,7 +83,7 @@ def testThreadCaveats(self): # are specifically testing that setting a lower j does _not_ trigger an # insufficient skipped test scenario. Oversized will not trigger as # -j6 satisfies this test's requirements. 
- output = self.runTests('-i', 'allocation_test', '-j', '6', '-p', '2', '--n-threads', '4').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-j', '6', '-p', '2', '--n-threads', '4') self.assertNotIn('CPUS', output) self.assertNotIn('OVERSIZED', output) self.assertIn('MAX_THREADS=3', output) @@ -94,7 +94,7 @@ def testPerfectAllocation(self): """ # Passing test triggering no caveats, as supplied allocations satisfies # the test's requirements - output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '2') self.assertNotIn('MIN_THREADS', output) self.assertNotIn('MAX_THREADS', output) self.assertNotIn('MIN_CPUS', output) diff --git a/python/TestHarness/tests/test_ArbitrarySpecFile.py b/python/TestHarness/tests/test_ArbitrarySpecFile.py index 7003fa6308d5..551d35481f52 100644 --- a/python/TestHarness/tests/test_ArbitrarySpecFile.py +++ b/python/TestHarness/tests/test_ArbitrarySpecFile.py @@ -15,16 +15,16 @@ def testArbitrarySpecFile(self): Verify an arbitrary test will run when we use the --spec-file argument """ # Test that we do not recursively find additional tests - output = self.runTests('--spec-file', 'tests/test_harness/arbitrary_test').decode('utf-8') + output = self.runTests('--spec-file', 'tests/test_harness/arbitrary_test') self.assertIn('tests/test_harness.always_ok', output) self.assertNotIn('tests/test_harness/arbitrary_directory.always_ok', output) # Test that we do find additional tests with recursion - output = self.runTests('--spec-file', 'tests/test_harness', '-i', 'arbitrary_test').decode('utf-8') + output = self.runTests('--spec-file', 'tests/test_harness', '-i', 'arbitrary_test') self.assertIn('tests/test_harness.always_ok', output) self.assertIn('tests/test_harness/arbitrary_directory.always_ok', output) # Test that we are not recursively finding our way backwards - output = 
self.runTests('--spec-file', 'tests/test_harness/arbitrary_directory', '-i', 'arbitrary_test').decode('utf-8') + output = self.runTests('--spec-file', 'tests/test_harness/arbitrary_directory', '-i', 'arbitrary_test') self.assertIn('tests/test_harness/arbitrary_directory.always_ok', output) self.assertNotIn('tests/test_harness.always_ok', output) diff --git a/python/TestHarness/tests/test_CSVDiffs.py b/python/TestHarness/tests/test_CSVDiffs.py index 5da059689d63..b1ca2df33bb2 100644 --- a/python/TestHarness/tests/test_CSVDiffs.py +++ b/python/TestHarness/tests/test_CSVDiffs.py @@ -19,9 +19,9 @@ def testDiffs(self): self.runTests('-i', 'csvdiffs') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.test_csvdiff.*?FAILED \(Override inputs not the same length\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.test_badfile.*?FAILED \(MISSING GOLD FILE\)') - self.checkStatus(e.output.decode('utf-8'), failed=2) + self.assertRegex(e.output, r'test_harness\.test_csvdiff.*?FAILED \(Override inputs not the same length\)') + self.assertRegex(e.output, r'test_harness\.test_badfile.*?FAILED \(MISSING GOLD FILE\)') + self.checkStatus(e.output, failed=2) def testMissingComparison(self): """ @@ -31,8 +31,8 @@ def testMissingComparison(self): self.runTests('-i', 'csvdiff_missing_comparison_file') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.test_csvdiff_comparison_file_missing.*?FAILED \(MISSING COMPARISON FILE\)') - self.checkStatus(e.output.decode('utf-8'), failed=1) + self.assertRegex(e.output, r'test_harness\.test_csvdiff_comparison_file_missing.*?FAILED \(MISSING COMPARISON FILE\)') + self.checkStatus(e.output, failed=1) def testCSVDiffScript(self): """ diff --git a/python/TestHarness/tests/test_CSVValidationTester.py b/python/TestHarness/tests/test_CSVValidationTester.py index ae009009eed8..004a3afe2cde 100644 --- a/python/TestHarness/tests/test_CSVValidationTester.py +++ 
b/python/TestHarness/tests/test_CSVValidationTester.py @@ -20,10 +20,10 @@ def testCSVValidationTester(self): """ with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'csv_validation_tester', '--no-color').decode('utf-8') + self.runTests('-i', 'csv_validation_tester', '--no-color') e = cm.exception - output = e.output.decode('utf-8') + output = e.output self.assertRegexpMatches(output, r'test_harness\.csv_validation_tester_01.*?OK') self.assertRegexpMatches(output, r'test_harness\.csv_validation_tester_02.*?FAILED \(DIFF\)') @@ -34,9 +34,9 @@ def testCSVValidationTesterVerbose(self): """ with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'csv_validation_tester', '--verbose', '--no-color').decode('utf-8') + self.runTests('-i', 'csv_validation_tester', '--verbose', '--no-color') e = cm.exception - output = e.output.decode('utf-8') + output = e.output self.assertRegexpMatches(output, 'csv_validation_tester_01.csv | 0.00 \xb1 0.01 | 0.01 \xb1 0.01') self.assertRegexpMatches(output, 'csv_validation_tester_02.csv | 0.00 \xb1 0.01 | 0.01 \xb1 0.00') diff --git a/python/TestHarness/tests/test_CustomEval.py b/python/TestHarness/tests/test_CustomEval.py index ffb411b8cbd0..8aabcf29e867 100644 --- a/python/TestHarness/tests/test_CustomEval.py +++ b/python/TestHarness/tests/test_CustomEval.py @@ -6,10 +6,10 @@ def testCustomEval(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'custom_eval') e = cm.exception - self.assertIn('Custom evaluation failed', e.output.decode('utf-8')) + self.assertIn('Custom evaluation failed', e.output) #test expect out failure with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'custom_eval') e = cm.exception - self.assertIn('expect_out and absent_out can not be supplied', e.output.decode('utf-8')) + self.assertIn('expect_out and absent_out can not be supplied', e.output) diff --git 
a/python/TestHarness/tests/test_Cyclic.py b/python/TestHarness/tests/test_Cyclic.py index 0a4270d976f1..9134a9b133f2 100644 --- a/python/TestHarness/tests/test_Cyclic.py +++ b/python/TestHarness/tests/test_Cyclic.py @@ -19,5 +19,5 @@ def testCyclic(self): self.runTests('--no-color', '-i', 'cyclic_tests') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.testC.*? FAILED \(Cyclic or Invalid Dependency Detected!\)') - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.test[A|B].*? \[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(e.output, r'tests/test_harness.testC.*? FAILED \(Cyclic or Invalid Dependency Detected!\)') + self.assertRegex(e.output, r'tests/test_harness.test[A|B].*? \[SKIPPED DEPENDENCY\] SKIP') diff --git a/python/TestHarness/tests/test_Deleted.py b/python/TestHarness/tests/test_Deleted.py index 1012d58f8676..01e09c927ba1 100644 --- a/python/TestHarness/tests/test_Deleted.py +++ b/python/TestHarness/tests/test_Deleted.py @@ -19,7 +19,7 @@ def testDeleted(self): self.runTests('--no-color', '-i', 'deleted', '-e') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.deleted.*? \[TEST DELETED TEST\] FAILED \(DELETED\)') + self.assertRegex(e.output, r'test_harness\.deleted.*? 
\[TEST DELETED TEST\] FAILED \(DELETED\)') # Verify return code is DELETED related (0x83) self.assertIs(0x83, e.returncode) @@ -28,5 +28,5 @@ def testNoExtraInfo(self): """ Test that deleted tests do not run without -e (extra) option """ - output = self.runTests('--no-color', '-i', 'deleted').decode('utf-8') + output = self.runTests('--no-color', '-i', 'deleted') self.assertNotIn('tests/test_harness.deleted', output) diff --git a/python/TestHarness/tests/test_DependencySkip.py b/python/TestHarness/tests/test_DependencySkip.py index c1cc7ec0c094..b7b33f1ee3f6 100644 --- a/python/TestHarness/tests/test_DependencySkip.py +++ b/python/TestHarness/tests/test_DependencySkip.py @@ -13,6 +13,6 @@ def testDependencySkip(self): """ Test skipping a test if its prereq is also skipped """ - output = self.runTests('--no-color', '-i', 'depend_skip_tests').decode('utf-8') + output = self.runTests('--no-color', '-i', 'depend_skip_tests') self.assertIn('[ALWAYS SKIPPED] SKIP', output) self.assertIn('[SKIPPED DEPENDENCY] SKIP', output) diff --git a/python/TestHarness/tests/test_Diff.py b/python/TestHarness/tests/test_Diff.py index 602878054242..41a3e54081cb 100644 --- a/python/TestHarness/tests/test_Diff.py +++ b/python/TestHarness/tests/test_Diff.py @@ -19,10 +19,10 @@ def testDiffs(self): self.runTests('-i', 'diffs') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?Running exodiff') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?Running csvdiff') + self.assertRegex(e.output, r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') + self.assertRegex(e.output, r'test_harness\.exodiff.*?Running exodiff') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?Running csvdiff') 
# Verify return code is DIFF related (0x81) self.assertIs(0x81, e.returncode) diff --git a/python/TestHarness/tests/test_DiffGold.py b/python/TestHarness/tests/test_DiffGold.py index 0f7fe02fcc92..7105ee725b6c 100644 --- a/python/TestHarness/tests/test_DiffGold.py +++ b/python/TestHarness/tests/test_DiffGold.py @@ -19,7 +19,7 @@ def testDiffs(self): self.runTests('-i', 'diff_golds') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?Running exodiff') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?Running csvdiff') + self.assertRegex(e.output, r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') + self.assertRegex(e.output, r'test_harness\.exodiff.*?Running exodiff') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?Running csvdiff') diff --git a/python/TestHarness/tests/test_DisplayRequired.py b/python/TestHarness/tests/test_DisplayRequired.py index d032cfc43531..bdfde22c884d 100644 --- a/python/TestHarness/tests/test_DisplayRequired.py +++ b/python/TestHarness/tests/test_DisplayRequired.py @@ -21,7 +21,7 @@ def testDislpayRequired(self): os.unsetenv('DISPLAY') output = self.runTests('--no-color', '-i', 'display_required') - self.assertRegex(output.decode('utf-8'), r'test_harness\.display_required.*? \[NO DISPLAY\] SKIP') + self.assertRegex(output, r'test_harness\.display_required.*? 
\[NO DISPLAY\] SKIP') if display: os.putenv('DISPLAY', display) diff --git a/python/TestHarness/tests/test_DistributedMesh.py b/python/TestHarness/tests/test_DistributedMesh.py index c3346027dcea..d52399c740cb 100644 --- a/python/TestHarness/tests/test_DistributedMesh.py +++ b/python/TestHarness/tests/test_DistributedMesh.py @@ -16,11 +16,11 @@ def testSyntax(self): """ # Verify the distributed mesh test is skipped - output = self.runExceptionTests('-i', 'mesh_mode_distributed', '--no-color').decode('utf-8') + output = self.runTests('-i', 'mesh_mode_distributed', '--no-color') self.assertIn('[MESH_MODE!=DISTRIBUTED] SKIP', output) # Verify the distributed mesh test is passing when providing --distributed # To be acurate, test for OK rather than asserting if 'distributed' is # missing from the output. output = self.runTests('--distributed', '-i', 'mesh_mode_distributed') - self.assertRegex(output.decode('utf-8'), 'test_harness.distributed_mesh.*?OK') + self.assertRegex(output, 'test_harness.distributed_mesh.*?OK') diff --git a/python/TestHarness/tests/test_DoLast.py b/python/TestHarness/tests/test_DoLast.py index 54877a2e70a1..4a5b04a96693 100644 --- a/python/TestHarness/tests/test_DoLast.py +++ b/python/TestHarness/tests/test_DoLast.py @@ -20,7 +20,7 @@ def testDoLastDuplicate(self): e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') + self.assertRegex(e.output, r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') def testDoLastDepends(self): """ @@ -31,21 +31,21 @@ def testDoLastDepends(self): e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') + self.assertRegex(e.output, r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') def testDoLast(self): """ Confirm 'do_last' tested last """ output = self.runTests('--no-color', '-i', 'do_last') - 
self.assertRegex(output.decode('utf-8'), 'tests/test_harness.a.*?OK\ntests/test_harness.do_last.*?OK') + self.assertRegex(output, 'tests/test_harness.a.*?OK\ntests/test_harness.do_last.*?OK') def testDoLastSkipped(self): """ Confirm 'do_last' is skipped if a test it depends on failed/skipped. """ output = self.runTests('--no-color', '-i', 'do_last_skipped') - self.assertRegex(output.decode('utf-8'), 'test_harness.do_last.*?\[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(output, 'test_harness.do_last.*?\[SKIPPED DEPENDENCY\] SKIP') def testDoLastName(self): """ @@ -56,4 +56,4 @@ def testDoLastName(self): e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness.*?FAILED \(Test named ALL when "prereq = ALL" elsewhere in test spec file!\)') + self.assertRegex(e.output, 'test_harness.*?FAILED \(Test named ALL when "prereq = ALL" elsewhere in test spec file!\)') diff --git a/python/TestHarness/tests/test_DryRun.py b/python/TestHarness/tests/test_DryRun.py index 87044d165ec4..99ca3cffcf9c 100644 --- a/python/TestHarness/tests/test_DryRun.py +++ b/python/TestHarness/tests/test_DryRun.py @@ -17,13 +17,13 @@ def testDryRun(self): """ output = self.runTests('-i', 'diffs', '--dry-run') - self.assertRegex(output.decode('utf-8'), 'test_harness\.exodiff.*?DRY RUN') - self.assertRegex(output.decode('utf-8'), 'test_harness\.csvdiff.*?DRY RUN') + self.assertRegex(output, 'test_harness\.exodiff.*?DRY RUN') + self.assertRegex(output, 'test_harness\.csvdiff.*?DRY RUN') # Skipped caveat test which returns skipped instead of 'DRY RUN' output = self.runTests('--no-color', '-i', 'depend_skip_tests', '--dry-run') - self.assertRegex(output.decode('utf-8'), r'tests/test_harness.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), r'tests/test_harness.needs_always_skipped.*? \[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(output, r'tests/test_harness.always_skipped.*? 
\[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, r'tests/test_harness.needs_always_skipped.*? \[SKIPPED DEPENDENCY\] SKIP') # Deleted caveat test which returns a deleted failing tests while # performing a dry run @@ -31,4 +31,4 @@ def testDryRun(self): self.runTests('--no-color', '-i', 'deleted', '-e', '--dry-run') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.deleted.*? \[TEST DELETED TEST\] FAILED \(DELETED\)') + self.assertRegex(e.output, r'test_harness\.deleted.*? \[TEST DELETED TEST\] FAILED \(DELETED\)') diff --git a/python/TestHarness/tests/test_Duplicate.py b/python/TestHarness/tests/test_Duplicate.py index 5e73b23a627e..07835ec2f8e4 100644 --- a/python/TestHarness/tests/test_Duplicate.py +++ b/python/TestHarness/tests/test_Duplicate.py @@ -18,7 +18,7 @@ def testDuplicateOutputs(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'duplicate_outputs') - output = cm.exception.output.decode('utf-8') + output = cm.exception.output self.assertIn('Tests: d, c', output) self.assertIn('File(s): good_out.e', output) @@ -26,7 +26,7 @@ def testDuplicateOutputs(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'duplicate_outputs_analyzejacobian') - output = cm.exception.output.decode('utf-8') + output = cm.exception.output self.assertIn('Tests: b, a', output) self.assertIn('File(s): good.i', output) @@ -39,8 +39,8 @@ def testDuplicateOutputsOK(self): output += self.runTests('-i', 'duplicate_outputs_ok', '--heavy') # skip case - self.assertNotRegexpMatches(output.decode('utf-8'), 'skipped_out.e') + self.assertNotRegexpMatches(output, 'skipped_out.e') # heavy case - self.assertNotRegexpMatches(output.decode('utf-8'), 'heavy_out.e') + self.assertNotRegexpMatches(output, 'heavy_out.e') # all - self.assertNotRegexpMatches(output.decode('utf-8'), 'FATAL TEST HARNESS ERROR') + self.assertNotRegexpMatches(output, 'FATAL TEST HARNESS ERROR') diff --git 
a/python/TestHarness/tests/test_Expect.py b/python/TestHarness/tests/test_Expect.py index da66d6b36218..447015a40f3f 100644 --- a/python/TestHarness/tests/test_Expect.py +++ b/python/TestHarness/tests/test_Expect.py @@ -19,13 +19,13 @@ def testExpect(self): self.runTests('-i', 'expect') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_err_pattern.*?FAILED \(EXPECTED ERROR MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_out_pattern.*?FAILED \(EXPECTED OUTPUT MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.absent_out_pattern.*?FAILED \(OUTPUT NOT ABSENT\)') + self.assertRegex(e.output, r'test_harness\.no_expect_err_pattern.*?FAILED \(EXPECTED ERROR MISSING\)') + self.assertRegex(e.output, r'test_harness\.no_expect_out_pattern.*?FAILED \(EXPECTED OUTPUT MISSING\)') + self.assertRegex(e.output, r'test_harness\.absent_out_pattern.*?FAILED \(OUTPUT NOT ABSENT\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_err_literal.*?FAILED \(EXPECTED ERROR MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_out_literal.*?FAILED \(EXPECTED OUTPUT MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.absent_out_literal.*?FAILED \(OUTPUT NOT ABSENT\)') + self.assertRegex(e.output, r'test_harness\.no_expect_err_literal.*?FAILED \(EXPECTED ERROR MISSING\)') + self.assertRegex(e.output, r'test_harness\.no_expect_out_literal.*?FAILED \(EXPECTED OUTPUT MISSING\)') + self.assertRegex(e.output, r'test_harness\.absent_out_literal.*?FAILED \(OUTPUT NOT ABSENT\)') def testExpectMissing(self): """ @@ -35,4 +35,4 @@ def testExpectMissing(self): self.runTests('-i', 'expect_missing_params') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'Either "expect_err" or "expect_assert" must be supplied') + self.assertRegex(e.output, r'Either "expect_err" or "expect_assert" must be supplied') diff --git 
a/python/TestHarness/tests/test_ExtraInfo.py b/python/TestHarness/tests/test_ExtraInfo.py index 291b00c40b25..12f246ab4168 100644 --- a/python/TestHarness/tests/test_ExtraInfo.py +++ b/python/TestHarness/tests/test_ExtraInfo.py @@ -39,7 +39,7 @@ def testExtraInfo(self): # will use the --ignore feature to force the test to run # regardless if that check(s) would otherwise cause this # test to be skipped. - output = self.runTests('-c', '-i', 'extra_info', '--ignore', '-e').decode('utf-8') + output = self.runTests('-c', '-i', 'extra_info', '--ignore', '-e') # Parse the output, and find the caveat string raw_caveat_string = re.findall(r'\[(.*)\]', output) diff --git a/python/TestHarness/tests/test_FailedTests.py b/python/TestHarness/tests/test_FailedTests.py index 096610102228..44bdda057c67 100644 --- a/python/TestHarness/tests/test_FailedTests.py +++ b/python/TestHarness/tests/test_FailedTests.py @@ -8,6 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import subprocess +import tempfile from TestHarnessTestCase import TestHarnessTestCase class TestHarnessTester(TestHarnessTestCase): @@ -17,21 +18,24 @@ def testFailedTests(self): to create a json file containing previous results, and again to only run the test which that has failed. 
""" - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--no-color', '-i', 'always_bad', '--results-file', 'failed-unittest') + with tempfile.TemporaryDirectory() as output_dir: + args = ['--no-color', '--results-file', 'failed-unittest', '-o', output_dir] + kwargs = {'tmp_output': False} + with self.assertRaises(subprocess.CalledProcessError) as cm: + self.runTests(*args, '-i', 'always_bad', **kwargs) - e = cm.exception + e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.always_ok.*?OK') - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.always_bad.*?FAILED \(CODE 1\)') + self.assertRegex(e.output, r'tests/test_harness.always_ok.*?OK') + self.assertRegex(e.output, r'tests/test_harness.always_bad.*?FAILED \(CODE 1\)') - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--no-color', '--failed-tests', '--results-file', 'failed-unittest') + with self.assertRaises(subprocess.CalledProcessError) as cm: + self.runTests(*args, '--failed-tests', **kwargs) - e = cm.exception + e = cm.exception - # Verify the passing test is not present - self.assertNotRegex(e.output.decode('utf-8'), r'tests/test_harness.always_ok.*?OK') + # Verify the passing test is not present + self.assertNotRegex(e.output, r'tests/test_harness.always_ok.*?OK') - # Verify the caveat represents a previous result - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.always_bad.*?\[PREVIOUS RESULTS: CODE 1\] FAILED \(CODE 1\)') + # Verify the caveat represents a previous result + self.assertRegex(e.output, r'tests/test_harness.always_bad.*?\[PREVIOUS RESULTS: CODE 1\] FAILED \(CODE 1\)') diff --git a/python/TestHarness/tests/test_Ignore.py b/python/TestHarness/tests/test_Ignore.py index 3f2c056f872f..1ff2c27bef64 100644 --- a/python/TestHarness/tests/test_Ignore.py +++ b/python/TestHarness/tests/test_Ignore.py @@ -16,7 +16,7 @@ def testIgnoreSkip(self): """ # Run a skipped test output = 
self.runTests('-i', 'ignore_skipped', '--ignore', 'skip') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_skipped.*?OK') def testIgnoreHeavy(self): """ @@ -24,7 +24,7 @@ def testIgnoreHeavy(self): """ # Run a skipped heavy test output = self.runTests('-i', 'ignore_heavy', '--ignore', 'heavy') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_heavy.*?OK') + self.assertRegex(output, 'test_harness\.ignore_heavy.*?OK') def testIgnoreCompiler(self): """ @@ -33,7 +33,7 @@ def testIgnoreCompiler(self): """ # Run a skipped compiler test output = self.runTests('-i', 'ignore_compiler', '--ignore', 'compiler') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_compiler.*?OK') + self.assertRegex(output, 'test_harness\.ignore_compiler.*?OK') def testIgnorePlatform(self): """ @@ -42,7 +42,7 @@ def testIgnorePlatform(self): """ # Run a skipped platform test output = self.runTests('-i', 'ignore_platform', '--ignore', 'platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_platform.*?OK') + self.assertRegex(output, 'test_harness\.ignore_platform.*?OK') def testIgnorePreReq(self): """ @@ -51,13 +51,13 @@ def testIgnorePreReq(self): """ # Run a skipped prereq test output = self.runTests('--no-color', '-i', 'ignore_prereq', '--ignore', 'prereq') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_skipped_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*? 
\[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_skipped_dependency.*?OK') # Check that a dependency test runs when its prereq test is skipped output = self.runTests('--no-color', '-i', 'ignore_prereq', '--ignore', 'skip') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_skipped_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_skipped_dependency.*?OK') def testIgnoreMultiple(self): """ @@ -66,7 +66,7 @@ def testIgnoreMultiple(self): """ # Run a multiple caveat skipped test by manually supplying each caveat output = self.runTests('-i', 'ignore_multiple', '--ignore', 'skip heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multiple.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multiple.*?OK') def testIgnoreAll(self): """ @@ -75,7 +75,7 @@ def testIgnoreAll(self): """ # Run a multiple caveat skipped test using built in default 'all' output = self.runTests('-i', 'ignore_multiple', '--ignore') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multiple.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multiple.*?OK') def testIgnoreMissingOne(self): """ @@ -84,7 +84,7 @@ def testIgnoreMissingOne(self): """ # Skip a multiple caveat test by not supplying enough caveats to ignore output = self.runTests('--no-color', '-i', 'ignore_multiple', '--ignore', 'skip heavy compiler') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multiple.*? \[PLATFORM!=NON_EXISTENT\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_multiple.*? 
\[PLATFORM!=NON_EXISTENT\] SKIP') def testIgnoreMultiplePreReq(self): """ @@ -93,27 +93,27 @@ def testIgnoreMultiplePreReq(self): """ # Run a multiple caveat prereq test using built in default 'all' output = self.runTests('-i', 'ignore_multiple_prereq', '--ignore') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*?OK') # Run a multiple caveat prereq test by manually supplying each caveat output = self.runTests('-i', 'ignore_multiple_prereq', '--ignore', 'prereq skip heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*?OK') # Skip a multiple caveat prereq test by not supplying enough caveats to ignore output = self.runTests('--no-color', '-i', 'ignore_multiple_prereq', '--ignore', 'prereq skip heavy compiler') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*? \[PLATFORM!=NON_EXISTENT\] SKIP') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*? 
\[PLATFORM!=NON_EXISTENT\] SKIP') # Check that a multiple caveat dependency test runs when its prereq test is skipped # This test may seem redundant, but `prereq` is handled differently than the other caveats output = self.runTests('--no-color', '-i', 'ignore_multiple_prereq', '--ignore', 'prereq heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*?OK') # Check that by supplying a very specific set of ignored paramaters, we # can properly trigger a skipped dependency scenario output = self.runTests('--no-color', '-i', 'ignore_multiple_prereq', '--ignore', 'heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*? \[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(output, 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*? 
\[SKIPPED DEPENDENCY\] SKIP') diff --git a/python/TestHarness/tests/test_LongRunning.py b/python/TestHarness/tests/test_LongRunning.py index 8a00eabc17d6..69a74ea12a81 100644 --- a/python/TestHarness/tests/test_LongRunning.py +++ b/python/TestHarness/tests/test_LongRunning.py @@ -14,6 +14,6 @@ def testLongRunningStatus(self): """ Test for RUNNING status in the TestHarness """ - output = self.runTests('-i', 'long_running').decode('utf-8') + output = self.runTests('-i', 'long_running') self.assertIn('RUNNING', output) self.assertIn('[FINISHED]', output) diff --git a/python/TestHarness/tests/test_LongestJobs.py b/python/TestHarness/tests/test_LongestJobs.py index d324dcd8fd23..fef405086faa 100644 --- a/python/TestHarness/tests/test_LongestJobs.py +++ b/python/TestHarness/tests/test_LongestJobs.py @@ -18,7 +18,7 @@ def testLongestJobs(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'longest_jobs', '--longest-jobs', '4') - output = cm.exception.output.decode('utf-8') + output = cm.exception.output self.assertIn('4 longest running jobs', output) self.assertRegex(output, r'(?s)longest running jobs.*run_1') @@ -30,7 +30,7 @@ def testLongestJobsNoneCompleted(self): """ Test for --longest-jobs in the TestHarness with no jobs ran. 
""" - output = self.runTests('-i', 'longest_jobs', '--re', 'foo', '--longest-jobs', '100').decode('utf-8') + output = self.runTests('-i', 'longest_jobs', '--re', 'foo', '--longest-jobs', '100') self.assertIn('100 longest running jobs', output) self.assertNotRegex(output, r'(?s)longest running jobs.*') diff --git a/python/TestHarness/tests/test_MinADSize.py b/python/TestHarness/tests/test_MinADSize.py index a160eede58b7..8ef6b69e0074 100644 --- a/python/TestHarness/tests/test_MinADSize.py +++ b/python/TestHarness/tests/test_MinADSize.py @@ -15,6 +15,6 @@ def testMinADSize(self): """ Test AD vector size """ - output = self.runTests('-i', 'ad_size', '--no-color').decode('utf-8') + output = self.runTests('-i', 'ad_size', '--no-color') self.assertRegex(output, r'tests/test_harness.enough \.* OK') self.assertRegex(output, r'tests/test_harness\.too_few \.* \[MINIMUM AD SIZE 1000 NEEDED, BUT MOOSE IS CONFIGURED WITH \d+\] SKIP') diff --git a/python/TestHarness/tests/test_MissingGold.py b/python/TestHarness/tests/test_MissingGold.py index 7e23f5929074..6b2235d44d49 100644 --- a/python/TestHarness/tests/test_MissingGold.py +++ b/python/TestHarness/tests/test_MissingGold.py @@ -19,8 +19,8 @@ def testMissingGold(self): self.runTests('-i', 'missing_gold') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.exodiff.*?FAILED \(MISSING GOLD FILE\)') - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.csvdiff.*?FAILED \(MISSING GOLD FILE\)') + self.assertRegex(e.output, 'test_harness\.exodiff.*?FAILED \(MISSING GOLD FILE\)') + self.assertRegex(e.output, 'test_harness\.csvdiff.*?FAILED \(MISSING GOLD FILE\)') # Verify return code is a general failure related (0x80) self.assertIs(0x80, e.returncode) diff --git a/python/TestHarness/tests/test_ParserErrors.py b/python/TestHarness/tests/test_ParserErrors.py index 6cd57851cc3d..907d6a7c25ff 100644 --- a/python/TestHarness/tests/test_ParserErrors.py +++ b/python/TestHarness/tests/test_ParserErrors.py @@ 
-17,5 +17,5 @@ def testSyntax(self): # check that parser errors print correctly # TODO: Are there more we can test? - output = self.runExceptionTests('-i', 'parse_errors').decode('utf-8') + output = self.runExceptionTests('-i', 'parse_errors') self.assertIn('duplicate parameter', output) diff --git a/python/TestHarness/tests/test_PythonVersion.py b/python/TestHarness/tests/test_PythonVersion.py index e3f9e4002944..e17871caf41c 100644 --- a/python/TestHarness/tests/test_PythonVersion.py +++ b/python/TestHarness/tests/test_PythonVersion.py @@ -12,7 +12,7 @@ class TestHarnessTester(TestHarnessTestCase): def testVersion(self): """Test that python=... is working.""" - output = self.runTests('-i', 'python_version').decode('utf-8') + output = self.runTests('-i', 'python_version') self.assertIn('[PYTHON != 2]', output) self.assertIn('[PYTHON != 3.5]', output) self.assertIn('[PYTHON != 3.4.1]', output) diff --git a/python/TestHarness/tests/test_RaceConditions.py b/python/TestHarness/tests/test_RaceConditions.py index f2a4922f132f..806b3192f448 100644 --- a/python/TestHarness/tests/test_RaceConditions.py +++ b/python/TestHarness/tests/test_RaceConditions.py @@ -20,4 +20,4 @@ def testRaceConditions(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('--pedantic-checks', '-i', 'output_clobber_simple') e = cm.exception - self.assertIn('Diagnostic analysis', e.output.decode('utf-8')) + self.assertIn('Diagnostic analysis', e.output) diff --git a/python/TestHarness/tests/test_Recover.py b/python/TestHarness/tests/test_Recover.py index d6a0a445752a..de8c8f7c725c 100644 --- a/python/TestHarness/tests/test_Recover.py +++ b/python/TestHarness/tests/test_Recover.py @@ -15,7 +15,7 @@ def testRecover(self): """ Test that --recover returns two passing statuses (part1 and the OK) """ - output = self.runTests('-i', 'always_ok', '--recover').decode('utf-8') + output = self.runTests('-i', 'always_ok', '--recover') self.assertIn('PART1', output) 
self.assertIn('RECOVER', output) @@ -30,8 +30,8 @@ def testRecoverPart1Fail(self): Test that --recover still checks status on Part1 tests """ with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'exception_transient', '--recover').decode('utf-8') + self.runTests('-i', 'exception_transient', '--recover') e = cm.exception - output = e.output.decode('utf-8') + output = e.output self.assertRegex(output, r'test_harness.*?part1.*?FAILED \(CRASH\)') diff --git a/python/TestHarness/tests/test_Replay.py b/python/TestHarness/tests/test_Replay.py index 6e9c9504a24d..ab044e73b0a0 100644 --- a/python/TestHarness/tests/test_Replay.py +++ b/python/TestHarness/tests/test_Replay.py @@ -11,6 +11,7 @@ import re import subprocess import shutil +import tempfile from TestHarnessTestCase import TestHarnessTestCase @@ -40,30 +41,40 @@ def reCompile(self): def testReplay(self): """ Test ability to replay back previous run results """ - output_a = self.runTests('--verbose', '--timing', '-i', 'always_ok', '--results-file', 'unittest_Replay') - output_b = self.runTests('--verbose', '--timing', '--show-last-run', '--results-file', 'unittest_Replay') - compile = self.reCompile() - formated_a = compile.findall(str(output_a)) - formated_b = compile.findall(str(output_b)) + with tempfile.TemporaryDirectory() as output_dir: + base_args = ['--verbose', '-c', '--timing', '--results-file', 'unittest_Replay', '-o', output_dir] + base_kwargs = {'tmp_output': False} + output_a = self.runTests(*base_args, '-i', 'always_ok', **base_kwargs) + output_b = self.runTests(*base_args, '--show-last-run', **base_kwargs) - if formated_a != formated_b: - self.fail(f'--show-last-run did not match last run\n\n{formated_a}\n\n{formated_b}') + # The only difference should be the total run time, so replace the run time + # from the first with the run time from the second + def parseSummary(output): + search = re.search(r'Ran (\d+) tests in (\d+.\d+) seconds', output) + self.assertTrue(search 
is not None) + return int(search.group(1)), float(search.group(2)) + num_tests, total_time = parseSummary(output_a) + other_num_tests, other_total_time = parseSummary(output_b) + self.assertEqual(num_tests, other_num_tests) + output_b = output_b.replace(f'Ran {num_tests} tests in {other_total_time} seconds', + f'Ran {num_tests} tests in {total_time} seconds') + self.assertEqual(output_a, output_b) def testDiffReplay(self): """ Verify that the feature fails when asked to capture new output """ - output_a = self.runTests('--verbose', '--timing', '-i', 'always_ok', '--results-file', 'unittest_Replay') - # --re=doesenotexist will produce no output (or rather different output than the above) - output_b = self.runTests('--verbose', '--timing', '--show-last-run', '--results-file', 'unittest_Replay', '--re=doesnotexist') - compile = self.reCompile() - formated_a = compile.findall(str(output_a)) - formated_b = compile.findall(str(output_b)) - - if formated_a == formated_b: - self.fail(f'--show-last-run matched when it should not have') + with tempfile.TemporaryDirectory() as output_dir: + base_args = ['--verbose', '--timing', '--results-file', 'unittest_Replay', '-o', output_dir] + base_kwargs = {'tmp_output': False} + output_a = self.runTests(*base_args, '-i', 'always_ok', **base_kwargs) + # --re=doesenotexist will produce no output (or rather different output than the above) + output_b = self.runTests(*base_args, '--show-last-run', '--re=doesnotexist', **base_kwargs) + self.assertIn('Ran 1 tests in', output_a) + self.assertIn('Ran 0 tests in', output_b) def testNoResultsFile(self): """ Verify the TestHarness errors correctly when there is no results file to work with """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--show-last-run', '--results-file', 'non_existent') - e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'A previous run does not exist') + with tempfile.TemporaryDirectory() as output_dir: + with 
self.assertRaises(subprocess.CalledProcessError) as cm: + self.runTests('--show-last-run', '--results-file', 'non_existent', '-o', output_dir, tmp_output=False) + e = cm.exception + self.assertIn(f'The previous run {output_dir}/non_existent does not exist', e.output) diff --git a/python/TestHarness/tests/test_ReportSkipped.py b/python/TestHarness/tests/test_ReportSkipped.py index 83e2b5c085bb..c6f1e53043df 100644 --- a/python/TestHarness/tests/test_ReportSkipped.py +++ b/python/TestHarness/tests/test_ReportSkipped.py @@ -16,9 +16,9 @@ def testSyntax(self): """ # Verify the skipped test _does_ appear - output = self.runExceptionTests('--no-color', '-i', 'ignore_skipped').decode('utf-8') + output = self.runTests('--no-color', '-i', 'ignore_skipped') self.assertIn('[ALWAYS SKIPPED] SKIP', output) # Verify the skipped test does _not_ appear - output = self.runTests('--no-color', '--no-report', '-i', 'ignore_skipped').decode('utf-8') + output = self.runTests('--no-color', '--no-report', '-i', 'ignore_skipped') self.assertNotIn('[ALWAYS SKIPPED] SKIP', output) diff --git a/python/TestHarness/tests/test_RequiredApps.py b/python/TestHarness/tests/test_RequiredApps.py index 90b5546a38d5..377e2015908f 100644 --- a/python/TestHarness/tests/test_RequiredApps.py +++ b/python/TestHarness/tests/test_RequiredApps.py @@ -15,6 +15,6 @@ def testRequiredApps(self): Test that the required_apps check works """ output = self.runTests('--no-color', '-i', 'required_apps') - self.assertRegex(output.decode('utf-8'), r'test_harness\.bad_app.*? \[APP DOESNOTEXIST NOT REGISTERED IN EXECUTABLE\] SKIP') - self.assertRegex(output.decode('utf-8'), r'test_harness\.good_app.*? OK') - self.checkStatus(output.decode('utf-8'), passed=1, skipped=1) + self.assertRegex(output, r'test_harness\.bad_app.*? \[APP DOESNOTEXIST NOT REGISTERED IN EXECUTABLE\] SKIP') + self.assertRegex(output, r'test_harness\.good_app.*? 
OK') + self.checkStatus(output, passed=1, skipped=1) diff --git a/python/TestHarness/tests/test_RequiredObjects.py b/python/TestHarness/tests/test_RequiredObjects.py index 38987a18fa2b..66aa5cb09e04 100644 --- a/python/TestHarness/tests/test_RequiredObjects.py +++ b/python/TestHarness/tests/test_RequiredObjects.py @@ -15,6 +15,6 @@ def testRequiredObjects(self): Test that the required_objects check works """ output = self.runTests('--no-color', '-i', 'required_objects') - self.assertRegex(output.decode('utf-8'), r'test_harness\.bad_object.*? \[DOESNOTEXIST NOT FOUND IN EXECUTABLE\] SKIP') - self.assertRegex(output.decode('utf-8'), r'test_harness\.good_objects.*? OK') - self.checkStatus(output.decode('utf-8'), passed=1, skipped=1) + self.assertRegex(output, r'test_harness\.bad_object.*? \[DOESNOTEXIST NOT FOUND IN EXECUTABLE\] SKIP') + self.assertRegex(output, r'test_harness\.good_objects.*? OK') + self.checkStatus(output, passed=1, skipped=1) diff --git a/python/TestHarness/tests/test_SchemaDiff.py b/python/TestHarness/tests/test_SchemaDiff.py index 8a8c646ddf90..836ff2c3380b 100644 --- a/python/TestHarness/tests/test_SchemaDiff.py +++ b/python/TestHarness/tests/test_SchemaDiff.py @@ -13,7 +13,7 @@ class TestHarnessTester(TestHarnessTestCase): def testSchemaDiff(self): output = self.runExceptionTests('-i', 'schemadiff') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_jsondiff.*?FAILED \(SCHEMADIFF\)') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_xmldiff.*?FAILED \(SCHEMADIFF\)') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_invalid_json.*?FAILED \(LOAD FAILED\)') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_invalid_xml.*?FAILED \(LOAD FAILED\)') + self.assertRegex(output, r'test_harness\.schema_jsondiff.*?FAILED \(SCHEMADIFF\)') + self.assertRegex(output, r'test_harness\.schema_xmldiff.*?FAILED \(SCHEMADIFF\)') + self.assertRegex(output, r'test_harness\.schema_invalid_json.*?FAILED 
\(LOAD FAILED\)') + self.assertRegex(output, r'test_harness\.schema_invalid_xml.*?FAILED \(LOAD FAILED\)') diff --git a/python/TestHarness/tests/test_ShouldExecute.py b/python/TestHarness/tests/test_ShouldExecute.py index a174b92ac3ad..9940bca5b07b 100644 --- a/python/TestHarness/tests/test_ShouldExecute.py +++ b/python/TestHarness/tests/test_ShouldExecute.py @@ -20,6 +20,6 @@ def testShouldExecute(self): self.runTests('-i', 'should_execute') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.should_execute_true_ok.*?OK') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.should_execute_false_ok.*?OK') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.should_execute_true_fail.*?FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'test_harness\.should_execute_true_ok.*?OK') + self.assertRegex(e.output, r'test_harness\.should_execute_false_ok.*?OK') + self.assertRegex(e.output, r'test_harness\.should_execute_true_fail.*?FAILED \(EXODIFF\)') diff --git a/python/TestHarness/tests/test_SoftHeavyDependency.py b/python/TestHarness/tests/test_SoftHeavyDependency.py index 718d46e63762..4c9f05475956 100644 --- a/python/TestHarness/tests/test_SoftHeavyDependency.py +++ b/python/TestHarness/tests/test_SoftHeavyDependency.py @@ -16,31 +16,31 @@ def testNotHeavy(self): """ output = self.runTests('--no-color', '-i', 'heavy_on_not_heavy') # The following should be skipped - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_a .*? \[HEAVY\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_b .*? \[HEAVY\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy .*? \[HEAVY\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_a .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_b .*? 
\[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? \[HEAVY\] SKIP') # The following should not be skipped, they should finish with an OK status. - self.assertRegex(output.decode('utf-8'), 'test_harness\.singleton_a .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.singleton_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_a .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? OK') + self.assertRegex(output, 'test_harness\.singleton_a .*? OK') + self.assertRegex(output, 'test_harness\.singleton_b .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy_a .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? OK') # The following should run, and should not list [implict heavy] caveat. # (a little redundant, but I don't see a way to check for this and the OK test above, in one go) - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.singleton_a .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.singleton_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_a .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? 
\[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.singleton_a .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.singleton_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy_a .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? \[IMPLICT HEAVY\] OK') # Special: caveat placements are random. Only check that it is skipped. # [skipped dependency,HEAVY] SKIP versus [HEAVY,skipped dependency] SKIP - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_heavy_a_and_heavy_b .*?SKIP') + self.assertRegex(output, 'test_harness\.heavy_on_heavy_a_and_heavy_b .*?SKIP') def testSoftHeavy(self): """ @@ -51,25 +51,25 @@ def testSoftHeavy(self): """ output = self.runTests('--no-color', '-i', 'heavy_on_not_heavy', '--heavy') # The following should run, and mention the additional [implicit heavy] caveat. - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy .*? \[IMPLICIT HEAVY\] OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_a .*? \[IMPLICIT HEAVY\] OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_b .*? \[IMPLICIT HEAVY\] OK') + self.assertRegex(output, 'test_harness\.not_heavy .*? \[IMPLICIT HEAVY\] OK') + self.assertRegex(output, 'test_harness\.not_heavy_a .*? \[IMPLICIT HEAVY\] OK') + self.assertRegex(output, 'test_harness\.not_heavy_b .*? \[IMPLICIT HEAVY\] OK') # The following should not be skipped, they should finish with an OK status. - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_a .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy .*? 
OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.heavy_a .*? OK') + self.assertRegex(output, 'test_harness\.heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy .*? OK') + self.assertRegex(output, 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? OK') # The following should not be skipped, and should not list [implicit heavy] caveat. # (a little redundant, but I don't see a way to check for this and the OK test above, in one go) - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_a .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_a .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_on_not_heavy .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? 
\[IMPLICT HEAVY\] OK') # The following should not run at all (the test is silent, and not displayed in the output) - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.singleton.*?') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_on_singleton_a_and_singleton_b.*?') + self.assertNotRegex(output, 'test_harness\.singleton.*?') + self.assertNotRegex(output, 'test_harness\.not_heavy_on_singleton_a_and_singleton_b.*?') diff --git a/python/TestHarness/tests/test_Syntax.py b/python/TestHarness/tests/test_Syntax.py index 84321c3fdb2d..7de9146bf8d8 100644 --- a/python/TestHarness/tests/test_Syntax.py +++ b/python/TestHarness/tests/test_Syntax.py @@ -16,22 +16,22 @@ def testSyntax(self): """ # Test that the SYNTAX PASS status message properly displays - output = self.runTests('-i', 'syntax').decode('utf-8') + output = self.runTests('-i', 'syntax') self.assertIn('SYNTAX PASS', output) # Test that the SYNTAX PASS status message properly displays - output = self.runTests('--check-input', '-i', 'syntax').decode('utf-8') + output = self.runTests('--check-input', '-i', 'syntax') self.assertIn('SYNTAX PASS', output) # Check that the _non_ SYNTAX test was not run - output = self.runTests('--check-input', '-i', 'no_syntax').decode('utf-8') + output = self.runTests('--check-input', '-i', 'no_syntax') self.assertNotIn('SYNTAX PASS', output) # Check that _thee_ SYNTAX test is not run - output = self.runTests('--no-check-input', '-i', 'syntax').decode('utf-8') + output = self.runTests('--no-check-input', '-i', 'syntax') self.assertNotIn('SYNTAX PASS', output) # Check that it is skipped when running valgrind - output = self.runTests('--valgrind', '-i', 'syntax').decode('utf-8') + output = self.runTests('--valgrind', '-i', 'syntax') self.assertIn('CHECK_INPUT==TRUE', output) self.checkStatus(output, skipped=1) diff --git a/python/TestHarness/tests/test_Timeout.py b/python/TestHarness/tests/test_Timeout.py index 4237807231ef..e36b56b106e9 100644 --- 
a/python/TestHarness/tests/test_Timeout.py +++ b/python/TestHarness/tests/test_Timeout.py @@ -19,7 +19,7 @@ def testTimeout(self): self.runTests('-i', 'timeout') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.timeout.*?TIMEOUT') + self.assertRegex(e.output, 'test_harness\.timeout.*?TIMEOUT') # Verify return code is TIMEOUT related (0x1) self.assertIs(0x1, e.returncode) @@ -34,7 +34,7 @@ def testTimeoutEnv(self): os.environ.pop('MOOSE_TEST_MAX_TIME') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.timeout.*?TIMEOUT') + self.assertRegex(e.output, 'test_harness\.timeout.*?TIMEOUT') # Verify return code is TIMEOUT related (0x1) self.assertIs(0x1, e.returncode) diff --git a/python/TestHarness/tests/test_TrimOutput.py b/python/TestHarness/tests/test_TrimOutput.py index 17c739b14811..3979793859f9 100644 --- a/python/TestHarness/tests/test_TrimOutput.py +++ b/python/TestHarness/tests/test_TrimOutput.py @@ -16,14 +16,14 @@ def testTrimOutput(self): Verify output exceeded buffer, and is therfore trimmed """ output = self.runTests('--no-color', '-i', 'trimmed_output', '-v') - self.assertIn('Output trimmed', output.decode('utf-8')) + self.assertIn('Output trimmed', output) def testNoTrimOutput(self): """ Verify trimming did not take place """ output = self.runTests('--no-color', '-i', 'always_ok', '-v') - self.assertNotIn('Output trimmed', output.decode('utf-8')) + self.assertNotIn('Output trimmed', output) def testNoTrimmedOutputOnError(self): """ @@ -34,4 +34,4 @@ def testNoTrimmedOutputOnError(self): self.runTests('--no-color', '-i', 'no_trim_on_error', '--no-trimmed-output-on-error', '-v') e = cm.exception - self.assertNotIn('Output trimmed', e.output.decode('utf-8')) + self.assertNotIn('Output trimmed', e.output) diff --git a/python/TestHarness/tests/test_UnknownParam.py b/python/TestHarness/tests/test_UnknownParam.py index e3a4d662a962..8c77f0de4d9e 100644 --- a/python/TestHarness/tests/test_UnknownParam.py +++ 
b/python/TestHarness/tests/test_UnknownParam.py @@ -15,4 +15,4 @@ def testUnknownParam(self): self.runTests('--no-color', '-i', 'unknown_param') self.assertIn('unknown_param:5: unused parameter "not_a_parameter"', - cm.exception.output.decode('utf-8')) + cm.exception.output) diff --git a/python/TestHarness/tests/test_UnknownPrereq.py b/python/TestHarness/tests/test_UnknownPrereq.py index 1da1d53a1fae..2887211f136c 100644 --- a/python/TestHarness/tests/test_UnknownPrereq.py +++ b/python/TestHarness/tests/test_UnknownPrereq.py @@ -19,4 +19,4 @@ def testUnknownPrereq(self): self.runTests('-i', 'unknown_prereq') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.foo.*?FAILED \(unknown dependency non_existent\)') + self.assertRegex(e.output, r'tests/test_harness.foo.*?FAILED \(unknown dependency non_existent\)') diff --git a/python/TestHarness/tests/test_UnreadableOutput.py b/python/TestHarness/tests/test_UnreadableOutput.py index 21ef17dbb9f2..d1712a58d8e4 100644 --- a/python/TestHarness/tests/test_UnreadableOutput.py +++ b/python/TestHarness/tests/test_UnreadableOutput.py @@ -19,4 +19,4 @@ def testUnreadableOutput(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'non_unicode') e = cm.exception - self.assertIn('non-unicode characters in output', e.output.decode('utf-8')) + self.assertIn('non-unicode characters in output', e.output) diff --git a/python/TestHarness/tests/test_WorkingDirectory.py b/python/TestHarness/tests/test_WorkingDirectory.py index a959ad87398f..754fd6a49118 100644 --- a/python/TestHarness/tests/test_WorkingDirectory.py +++ b/python/TestHarness/tests/test_WorkingDirectory.py @@ -17,7 +17,7 @@ def testWorkingDirectoryGood(self): """ # Test a working scenario output = self.runTests('--no-color', '-i', 'working_directory', '--re', 'relative_and_available') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.relative_and_available.*? 
OK') + self.assertRegex(output, 'tests/test_harness.relative_and_available.*? OK') def testDependency(self): @@ -26,7 +26,7 @@ def testDependency(self): self.runTests('--no-color', '-i', 'working_directory') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.depend_on_available.*? OK') + self.assertRegex(e.output, r'tests/test_harness.depend_on_available.*? OK') def testAbsolutePath(self): # Test we catch an absolute path @@ -34,7 +34,7 @@ def testAbsolutePath(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'absolute_path') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.absolute_path.*? FAILED \(ABSOLUTE PATH DETECTED\)') + self.assertRegex(e.output, r'tests/test_harness.absolute_path.*? FAILED \(ABSOLUTE PATH DETECTED\)') def testDirectoryNotFound(self): # Test we catch a directory not found @@ -42,7 +42,7 @@ def testDirectoryNotFound(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'non_existent') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.non_existent.*? FAILED \(WORKING DIRECTORY NOT FOUND\)') + self.assertRegex(e.output, r'tests/test_harness.non_existent.*? FAILED \(WORKING DIRECTORY NOT FOUND\)') def testExodiff(self): ## Specific Testers ## @@ -51,7 +51,7 @@ def testExodiff(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'exodiff') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.exodiff.*? FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'tests/test_harness.exodiff.*? FAILED \(EXODIFF\)') def testCSVDiff(self): # csvdiff can access sub directories @@ -59,7 +59,7 @@ def testCSVDiff(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'csvdiff') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.csvdiff.*? 
FAILED \(Override inputs not the same length\)') + self.assertRegex(e.output, r'tests/test_harness.csvdiff.*? FAILED \(Override inputs not the same length\)') def testRunException(self): # RunException can access sub directories @@ -67,4 +67,4 @@ def testRunException(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'runexception') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.runexception.*? FAILED \(EXPECTED ERROR MISSING\)') + self.assertRegex(e.output, r'tests/test_harness.runexception.*? FAILED \(EXPECTED ERROR MISSING\)') diff --git a/python/TestHarness/tests/test_WriteResults.py b/python/TestHarness/tests/test_WriteResults.py index 1845d3e4cd5e..f105c130e6ad 100644 --- a/python/TestHarness/tests/test_WriteResults.py +++ b/python/TestHarness/tests/test_WriteResults.py @@ -7,62 +7,60 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import shutil, os, subprocess +import json, os, subprocess, tempfile from TestHarnessTestCase import TestHarnessTestCase class TestHarnessTester(TestHarnessTestCase): - def setUp(self): - """ - setUp occurs before every test. Clean up previous results file - """ - self.output_dir = os.path.join(os.getenv('MOOSE_DIR'), 'test', 'WriteResults_OUTPUT') - - try: - # remove previous results file - shutil.rmtree(self.output_dir) - except: - pass - def tearDown(self): """ tearDown occurs after every test. 
""" self.setUp() - def testWriteOK(self): - """ Test ability to write separate OK test --sep-files-ok """ - self.runTests('--no-color', '-i', 'always_ok', '--sep-files-ok', '--output-dir', self.output_dir) - if not os.path.exists(os.path.join(self.output_dir, 'test_harness.always_ok.OK.txt')): - self.fail('Failed to create sep-files-ok') + def checkFilesExist(self, output_dir, tests, output_object_names): + # The directories within the test directory where these tests reside + test_folders = ['tests', 'test_harness'] + # The complete path to the directory where the tests reside + test_base_path = os.path.join(os.getenv('MOOSE_DIR'), 'test', *test_folders) + # The complete path where the output should reside + output_base_path = os.path.join(output_dir, *test_folders) - with self.assertRaises(subprocess.CalledProcessError): - self.runTests('--no-color', '-i', 'diffs', '--sep-files-ok', '--output-dir', self.output_dir) + # Load the previous results + with open(os.path.join(output_dir, '.previous_test_results.json')) as f: + results = json.load(f) + test_results = results['TESTS'] + # We should only have one test spec + self.assertEqual(1, len(test_results)) + # The test spec should be in the results + self.assertIn(test_base_path, test_results) + test_spec_results = test_results[test_base_path] + # The number of tests in the test spec should be the number provided + self.assertEqual(len(tests), len(test_spec_results)) - if (os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.DIFF.txt')) - or os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.OK.txt'))): - self.fail('Test results which failed were created when asked NOT to do so: --sep-files-ok') - - def testWriteFail(self): - """ Test ability to write separate Fail test --sep-files-fail """ - with self.assertRaises(subprocess.CalledProcessError): - self.runTests('--no-color', '-i', 'diffs', '--sep-files-fail', '--output-dir', self.output_dir) - - if not 
(os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.DIFF.txt')) - and os.path.exists(os.path.join(self.output_dir, 'test_harness.csvdiff.DIFF.txt'))): - self.fail('Failed to create sep-files-fail') - - self.runTests('--no-color', '-i', 'always_ok', '--sep-files-fail', '--output-dir', self.output_dir) - if os.path.exists(os.path.join(self.output_dir, 'test_harness.always_ok.OK.txt')): - self.fail('Test results which passed were created when asked NOT to do so: --sep-files-fail') + for test in tests: + # The test name should be in the test spec results + test_name_short = f'{"/".join(test_folders)}.{test}' + self.assertIn(test_name_short, test_spec_results) + test_results = test_spec_results[test_name_short] + # Get the output files from the test spec + result_output_files = test_results['OUTPUT_FILES'] + # Make sure each output file exists and is set in the results file + for name in output_object_names: + output_path = f'{output_base_path}/{test}.{name}_out.txt' + self.assertTrue(os.path.exists(output_path)) + self.assertEqual(result_output_files[name], output_path) + # And make sure that we don't have output from any other objects + for name, output_path in result_output_files.items(): + if name not in output_object_names: + self.assertEqual(output_path, None) def testWriteAll(self): """ Test write all output files --sep-files """ - with self.assertRaises(subprocess.CalledProcessError): - self.runTests('--no-color', '-i', 'diffs', '--sep-files', '--output-dir', self.output_dir) - - self.runTests('--no-color', '-i', 'always_ok', '--sep-files', '--output-dir', self.output_dir) + with tempfile.TemporaryDirectory() as output_dir: + with self.assertRaises(subprocess.CalledProcessError): + self.runTests('--no-color', '-i', 'diffs', '--sep-files', '-o', output_dir, tmp_output=False) + self.checkFilesExist(output_dir, ['csvdiff', 'exodiff'], ['runner_run', 'tester']) - if not (os.path.exists(os.path.join(self.output_dir, 'test_harness.always_ok.OK.txt')) - 
and os.path.exists(os.path.join(self.output_dir, 'test_harness.csvdiff.DIFF.txt')) - and os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.DIFF.txt'))): - self.fail('Failed to create all output files --sep-files') + with tempfile.TemporaryDirectory() as output_dir: + self.runTests('--no-color', '-i', 'always_ok', '--sep-files', '-o', output_dir, tmp_output=False) + self.checkFilesExist(output_dir, ['always_ok'], ['runner_run']) From 7c8e10dd2c09d84b4b7c23beaf7b669f68143054 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sat, 24 Aug 2024 23:28:07 -0600 Subject: [PATCH 224/243] Add runner out to gitgnore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 67ab743aa284..e73ac48094af 100644 --- a/.gitignore +++ b/.gitignore @@ -328,6 +328,7 @@ share/ /tutorials/darcy_thermo_mech/*/darcy_thermo_mech.yaml # test harness --sep-files output +*.runner_run_out.txt *.runner_out.txt *.tester_out.txt *.job_out.txt From 58d7a5c3a3417460a69aeed92ba8eeb74c2049c0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Sun, 25 Aug 2024 07:51:36 -0600 Subject: [PATCH 225/243] Set results file before looking for its existance --- python/TestHarness/TestHarness.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 28ce3f2fd63a..7830b6ac82fc 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -258,13 +258,6 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): # Parse arguments self.parseCLArgs(argv) - # Setup absolute paths and output paths - if self.options.output_dir: - self.options.output_dir = os.path.abspath(self.options.output_dir) - self.options.results_file = os.path.join(self.options.output_dir, self.options.results_file) - else: - self.options.results_file = os.path.abspath(self.options.results_file) - checks = {} checks['platform'] = 
util.getPlatforms() checks['machine'] = util.getMachine() @@ -1139,13 +1132,21 @@ def checkAndUpdateCLArgs(self): if opts.spec_file and not os.path.exists(opts.spec_file): print('ERROR: --spec-file supplied but path does not exist') sys.exit(1) - if opts.failed_tests and not os.path.exists(opts.results_file): - print('ERROR: --failed-tests could not detect a previous run') - sys.exit(1) if opts.verbose and opts.quiet: print('Do not be an oxymoron with --verbose and --quiet') sys.exit(1) + # Setup absolute paths and output paths + if opts.output_dir: + opts.output_dir = os.path.abspath(opts.output_dir) + opts.results_file = os.path.join(opts.output_dir, opts.results_file) + else: + opts.results_file = os.path.abspath(opts.results_file) + + if opts.failed_tests and not os.path.exists(opts.results_file): + print('ERROR: --failed-tests could not detect a previous run') + sys.exit(1) + # Update any keys from the environment as necessary if not self.options.method: if 'METHOD' in os.environ: From e7dff71c6d22533bf5d197129db82a7509b7428b Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Tue, 27 Aug 2024 10:09:09 -0600 Subject: [PATCH 226/243] Don't check for warnings first --- modules/thermal_hydraulics/test/tests/utils/logger/tests | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/thermal_hydraulics/test/tests/utils/logger/tests b/modules/thermal_hydraulics/test/tests/utils/logger/tests index a87fbe8267f5..5da6d319da5f 100644 --- a/modules/thermal_hydraulics/test/tests/utils/logger/tests +++ b/modules/thermal_hydraulics/test/tests/utils/logger/tests @@ -14,7 +14,9 @@ [errors] type = RunException input = 'test.i' - expect_err = "componentB: warning 2.*componentA: error 1.*componentA: error 2.*componentB: error 1.*componentB: error 2" + # We can't reliably check for the warnings first here because the ordering of the warning + # and the error is MPI implementation dependent + expect_err = "componentA: error 1.*componentA: error 2.*componentB: 
error 1.*componentB: error 2" allow_test_objects = true allow_warnings = true # Testing that warnings are emitted requirement = 'The system shall be able to output errors in a batch.' From 32193c8bfe6ee865ccff99bd7506e3128fffc133 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 10:30:33 -0600 Subject: [PATCH 227/243] Allow retries when the PBS scheduler is misbehaving --- python/TestHarness/schedulers/RunHPC.py | 33 ++++++++++++++++++++++--- python/TestHarness/schedulers/RunPBS.py | 9 +++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index db03635446c0..3ec98f40b44f 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -255,18 +255,36 @@ def _callSSH(self, command): full_command = f"ssh {host} '{command}'" return exit_code, result.rstrip(), full_command - def callHPC(self, pool_type, command): + def callHPC(self, pool_type, command: str, num_retries: int = 0, retry_time: float = 5): """ Wrapper for calling a HPC command (qsub, qstat, etc) that supports SSH-ing to another host as needed when calling from within apptainer + Set num_retires to retry the command this many times, waiting + retry_time sec between each retry. The command will only be retried + if self.callHPCShouldRetry() is True for that command. This lets + us retry commands given known failures. + Requires the "pool" to specify which command pool to use, of the RunHPC.CallHPCPoolType types. 
""" if not self.ssh_hosts: raise Exception('HPC not currently supported outside of a container') - return self.call_hpc_pool[pool_type].apply(self._callSSH, (command,)) + exit_code = None + result = None + full_cmd = None + + for i in range(num_retries + 1): + exit_code, result, full_cmd = self.call_hpc_pool[pool_type].apply(self._callSSH, (command,)) + if exit_code == 0: + break + if self.callHPCShouldRetry(pool_type, result): + time.sleep(retry_time) + else: + break + + return exit_code, result, full_cmd def getJobSlots(self, job): # Jobs only use one slot because they are ran externally @@ -448,7 +466,7 @@ def submitJob(self, job, hold): cmd = '; '.join(cmd) # Do the submission; this is thread safe - exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.submit, cmd) + exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.submit, cmd, num_retries=5) # Start the queued timer if needed if not hold: @@ -513,7 +531,7 @@ def queueJob(self, job): raise Exception('Job should not be held with holding disabled') cmd = f'{self.getHPCQueueCommand()} {hpc_job.id}' - exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.queue, cmd) + exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.queue, cmd, num_retries=5) if exit_code != 0: try: self.killHPCJob(hpc_job, lock=False) # already locked @@ -894,3 +912,10 @@ def appendResultFileJob(self, job): entry = {'id': hpc_job.id, 'submission_script': self.getHPCJobSubmissionPath(job)} return {'HPC': entry} + + def callHPCShouldRetry(self, pool_type, result: str): + """ + Entry point for a derived scheduler class to tell us if we can + retry a command given a failure with a certain result. 
+ """ + return False diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index ea7de8abb0d8..6e517e70ed69 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -152,3 +152,12 @@ def parseHPCSubmissionJobID(self, result): if not search: raise Exception(f'qsub has unexpected ID {result}') return result + + def callHPCShouldRetry(self, pool_type, result: str): + # If we're submitting/releasing/getting a status and cannot connect + # to the scheduler, we can retry + if pool_type in [self.CallHPCPoolType.submit, + self.CallHPCPoolType.queue, + self.CallHPCPoolType.status]: + return 'pbs_iff: cannot connect to host' in result + return False From 50030bb1e8cd7f26bd5e0083c6fc47662488d0e1 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 11:57:13 -0600 Subject: [PATCH 228/243] Sanitize all of the output --- python/TestHarness/OutputInterface.py | 73 +++++++++++++++++++++++-- python/TestHarness/runners/HPCRunner.py | 2 +- python/TestHarness/runners/Runner.py | 24 +------- python/TestHarness/schedulers/Job.py | 26 +++++++++ 4 files changed, 98 insertions(+), 27 deletions(-) diff --git a/python/TestHarness/OutputInterface.py b/python/TestHarness/OutputInterface.py index 3447ef5e107f..62d32f1a1166 100644 --- a/python/TestHarness/OutputInterface.py +++ b/python/TestHarness/OutputInterface.py @@ -8,6 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import os +import json class OutputInterface: """ Helper class for writing output to either memory or a file """ @@ -17,6 +18,13 @@ def __init__(self): # The path to write output to, if any self.separate_output_path = None + class BadOutputException(Exception): + """ Exception that is thrown when bad output is detected """ + def __init__(self, errors: list[str]): + self.errors = errors + message = 'Bad output detected: ' + ', '.join(errors) + super().__init__(message) + def setSeparateOutputPath(self, separate_output_path): 
""" Sets the path for writing output to """ self.separate_output_path = separate_output_path @@ -36,16 +44,32 @@ def hasOutput(self) -> bool: return os.path.isfile(self.separate_output_path) return len(self.output) > 0 - def getOutput(self) -> str: - """ Gets the underlying output, either from file or memory """ + def getOutput(self, sanitize: bool = True) -> str: + """ + Gets the underlying output, either from file or memory + + The sanitize parameter triggers whether or not to check + for bad output, in which case an exception will be thrown + if it is found. The intention here is to sanitize it + ahead of time with self.sanitizeOutput() so that you can + clean it then and appropriately report the error earlier + on before the output is used. + """ + output = '' if self.separate_output_path: try: - return open(self.separate_output_path, 'r').read() + output = open(self.separate_output_path, 'r').read() except FileNotFoundError: pass else: - return self.output - return '' + output = self.output + + if sanitize: + _, sanitize_failures = self._sanitizeOutput(output) + if sanitize_failures: + raise self.BadOutputException(sanitize_failures) + + return output def setOutput(self, output: str): """ Sets the output given some output string """ @@ -72,3 +96,42 @@ def clearOutput(self): os.remove(self.separate_output_path) else: self.output = '' + + @staticmethod + def _sanitizeOutput(output): + """ + Internal method for taking an output string, sanitizing + it if needed, and then returning a list of the failures + that were encountered (if any) + """ + failures = set() + + # Check for invalid characters + try: + json.dumps(output) + except UnicodeDecodeError: + # Convert invalid output to something json can handle + output = output.decode('utf-8','replace').encode('ascii', 'replace') + # Alert the user that output has invalid characters + failures.add('invalid output characters') + + # Check for NULL characters + null_chars = ['\0', '\x00'] + for null_char in null_chars: 
+ if null_char in output: + output = output.replace(null_char, 'NULL') + failures.add('NULL output') + + return output, list(failures) + + def sanitizeOutput(self): + """ + Sanitizes the output in place and returns a list of the + checks that failed, if any. + + Should be called before processing the output. + """ + output, failures = self._sanitizeOutput(self.getOutput(sanitize=False)) + if failures: + self.setOutput(output) + return failures diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 72111af3be01..7b394c0b743e 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -145,7 +145,7 @@ def print_files(files, type): # character check that happens in Runner.finalize() to still # be valid. if self.exit_code != 0 and self.job.getTester().hasOpenMPI(): - output = self.getRunOutput().getOutput() + output = self.getRunOutput().getOutput(sanitize=False) if output: prefix = '\n' null = '\0' diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py index d87186826db2..89b0e36d0f32 100644 --- a/python/TestHarness/runners/Runner.py +++ b/python/TestHarness/runners/Runner.py @@ -70,32 +70,14 @@ def finalize(self): # Load the redirected output files, if any run_output = self.getRunOutput() for file_path in self.job.getTester().getRedirectedOutputFiles(self.options): - self.run_output.appendOutput(util.outputHeader(f'Begin redirected output {file_path}')) + run_output.appendOutput(util.outputHeader(f'Begin redirected output {file_path}')) if os.access(file_path, os.R_OK): with open(file_path, 'r+b') as f: - self.run_output.appendOutput(self.readOutput(f)) + run_output.appendOutput(self.readOutput(f)) else: self.job.setStatus(self.job.error, 'FILE TIMEOUT') self.appendOutput(f'File {file_path} unavailable') - self.run_output.appendOutput(util.outputHeader(f'End redirected output {file_path}')) - - # Check for invalid unicode in output - output = 
self.getOutput() - try: - json.dumps(output) - except UnicodeDecodeError: - # Convert invalid output to something json can handle - self.setOutput(output.decode('utf-8','replace').encode('ascii', 'replace')) - # Alert the user that output has invalid characters - self.job.addCaveats('invalid characters in output') - - # Remove NULL output and fail if it exists - output = self.getOutput() - null_chars = ['\0', '\x00'] - for null_char in null_chars: - if null_char in output: - self.setOutput(output.replace(null_char, 'NULL')) - self.job.setStatus(self.job.error, 'NULL characters in output') + run_output.appendOutput(util.outputHeader(f'End redirected output {file_path}')) def getExitCode(self): """ diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index db3e7e9d0928..6d26b56b3596 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -365,8 +365,12 @@ def run(self): # Helper for exiting def finalize(): + # Run cleanup with self.timer.time('job_cleanup'): self.cleanup() + # Sanitize the output from all objects + self.sanitizeAllOutput() + # Stop timing self.timer.stopMain() # Set the output path if its separate and initialize the output @@ -454,6 +458,13 @@ def try_catch(do, exception_name, timer_name): # And do finalize (really just cleans up output) runner_finalize = lambda: self._runner.finalize() if not try_catch(runner_finalize, 'RUNNER FINALIZE', 'runner_finalize'): + finalize() + return + + # Exit if we have bad output in the runner before running the tester + self.sanitizeAllOutput() + if self.isError(): + finalize() return # Check if the files we checked on earlier were modified. @@ -513,6 +524,21 @@ def getAllOutput(self) -> dict: output[name] = object.getOutput() return output + def sanitizeAllOutput(self): + """ Sanitizes the output from all output objects + + If output is retreived from these objects via getOutput() and + it contains bad output, it will throw an error. 
Instead of + throwing an error, we will sanitize it before hand and then + set a Job error so that we can still continue in a failed state. + """ + all_failures = [] + for name, object in self.getOutputObjects().items(): + failures = object.sanitizeOutput() + all_failures.extend([s + f' in {name}' for s in failures]) + if all_failures: + self.setStatus(self.error, ', '.join(all_failures)) + def getOutputForScreen(self): """ Gets the output for printing on screen """ show_output = self.options.verbose or (self.isFail() and not self.options.quiet) or self.isError() From a4b922ab6e90c578e84de9d6ab55406fa5335aef Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 14:51:20 -0600 Subject: [PATCH 229/243] Remove type hint because it doesn't work with python 3.8 --- python/TestHarness/OutputInterface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/OutputInterface.py b/python/TestHarness/OutputInterface.py index 62d32f1a1166..b6e54a7b2548 100644 --- a/python/TestHarness/OutputInterface.py +++ b/python/TestHarness/OutputInterface.py @@ -20,7 +20,7 @@ def __init__(self): class BadOutputException(Exception): """ Exception that is thrown when bad output is detected """ - def __init__(self, errors: list[str]): + def __init__(self, errors): self.errors = errors message = 'Bad output detected: ' + ', '.join(errors) super().__init__(message) From 6fd6e92f067d745b39c64e5d39856d041a70b052 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 15:06:51 -0600 Subject: [PATCH 230/243] Match other null character as well --- python/TestHarness/runners/SubprocessRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index bca187eeab21..f4a4f9bc8d32 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -94,7 +94,7 @@ def wait(self, timer): # it 
until we figure out what's broken if file == self.errfile and self.exit_code != 0 \ and self.job.getTester().hasOpenMPI() and len(output) > 2 \ - and output[-3] == '\n\0\n': + and output[-3:] in ['\n\0\n', '\n\x00\n']: output = output[:-3] self.getRunOutput().appendOutput(output) From e4294da843afdc12379669dd93c7ea353944f418 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 16:55:54 -0600 Subject: [PATCH 231/243] Add test for OutputInterface object --- .../TestHarness/tests/test_OutputInterface.py | 125 ++++++++++++++++++ python/TestHarness/tests/tests | 6 + 2 files changed, 131 insertions(+) create mode 100644 python/TestHarness/tests/test_OutputInterface.py diff --git a/python/TestHarness/tests/test_OutputInterface.py b/python/TestHarness/tests/test_OutputInterface.py new file mode 100644 index 000000000000..8587d4c84466 --- /dev/null +++ b/python/TestHarness/tests/test_OutputInterface.py @@ -0,0 +1,125 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import unittest +import tempfile +import os + +from TestHarness import OutputInterface + +class TestHarnessTester(unittest.TestCase): + def testInMemory(self): + # Empty state + oi = OutputInterface() + self.assertIsNone(oi.getSeparateOutputFilePath()) + self.assertFalse(oi.hasOutput()) + self.assertEqual(oi.output, '') + self.assertEqual(oi.getOutput(), '') + + # Add output + output = 'foo' + oi.setOutput(output) + self.assertIsNone(oi.getSeparateOutputFilePath()) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Clear output + oi.clearOutput() + self.assertFalse(oi.hasOutput()) + + # Append output empty + output = 'bar' + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + 
self.assertEqual(oi.getOutput(), output) + + # Append more + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output + output) + + # Reset + output = 'foo' + oi.setOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + # And then append + for i in range(2): + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output * 3) + + def testSeparate(self): + with tempfile.TemporaryDirectory() as dir: + output_file = os.path.join(dir, 'output') + + # Empty state + oi = OutputInterface() + oi.setSeparateOutputPath(output_file) + self.assertEqual(oi.getSeparateOutputFilePath(), output_file) + self.assertFalse(os.path.exists(output_file)) + self.assertFalse(oi.hasOutput()) + self.assertEqual(oi.output, '') + self.assertEqual(oi.getOutput(), '') + + # Add output + output = 'foo' + oi.setOutput(output) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Clear output + oi.clearOutput() + self.assertFalse(os.path.exists(output_file)) + self.assertFalse(oi.hasOutput()) + + # Append output empty + output = 'bar' + oi.appendOutput(output) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Append more + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertTrue(os.path.exists(output_file)) + self.assertEqual(oi.getOutput(), output + output) + + # Reset + output = 'foo' + oi.setOutput(output) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + # And then append + for i in range(2): + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output * 3) + + def testBadOutput(self): + oi = OutputInterface() + + null_chars = 'foobar\nwith a dirty \0and another dirty\x00' + null_replaced = 
null_chars.replace('\0', 'NULL').replace('\x00', 'NULL') + + # Set null characters + oi.setOutput(null_chars) + failures = oi.sanitizeOutput() + self.assertEqual(failures, ['NULL output']) + self.assertEqual(oi.getOutput(), null_replaced) + + # Set null characters without sanitize + oi.setOutput(null_chars) + with self.assertRaises(OutputInterface.BadOutputException) as e: + oi.getOutput() + self.assertEqual(e.exception.errors, ['NULL output']) + self.assertEqual(str(e.exception), 'Bad output detected: NULL output') diff --git a/python/TestHarness/tests/tests b/python/TestHarness/tests/tests index 69226d40ecf5..794b54b3698d 100644 --- a/python/TestHarness/tests/tests +++ b/python/TestHarness/tests/tests @@ -281,4 +281,10 @@ requirement = "The system shall not skip non-heavy tests for which heavy tests depend on" issues = '#26215' [] + [test_output_interface] + type = PythonUnitTest + input = test_OutputInterface.py + requirement = 'The system shall provide a common interface for storing and retrieving output that supports sanitization' + issues = '#27562' + [] [] From 1af09b9a3cdad67688cbea5eff5f5231173f2b52 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 17:26:21 -0600 Subject: [PATCH 232/243] Also match \x00 for openmpi --- python/TestHarness/runners/SubprocessRunner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py index bca187eeab21..f4a4f9bc8d32 100644 --- a/python/TestHarness/runners/SubprocessRunner.py +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -94,7 +94,7 @@ def wait(self, timer): # it 
- # We're looking for \n\0##########', which is at the end of the + # We're looking for \n[\0,\x00]##########', which is at the end of the # apptainer execution within hpc_template. This allows the null # character check that happens in Runner.finalize() to still # be valid. if self.exit_code != 0 and self.job.getTester().hasOpenMPI(): output = self.getRunOutput().getOutput(sanitize=False) if output: - prefix = '\n' - null = '\0' - suffix = '##########' - all = f'{prefix}{null}{suffix}' - no_null = f'{prefix}{suffix}' - if all in output: - self.getRunOutput().setOutput(output.replace(all, no_null)) + for null in ['\0', '\x00']: + prefix = '\n' + suffix = '##########' + all = f'{prefix}{null}{suffix}' + no_null = f'{prefix}{suffix}' + if all in output: + self.getRunOutput().setOutput(output.replace(all, no_null)) def kill(self): if self.hpc_job: From 9aeb3e2f5d52b2e72b26d89e352c496b5b859da7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 6 Sep 2024 17:58:09 -0600 Subject: [PATCH 233/243] Forcefully flush output for job status --- python/TestHarness/TestHarness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 7830b6ac82fc..e37b7469f824 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -623,7 +623,7 @@ def handleJobStatus(self, job, caveats=None): # Print status with caveats (if caveats not overridden) caveats = True if caveats is None else caveats - print((util.formatResult(job, self.options, caveats=caveats))) + print(util.formatResult(job, self.options, caveats=caveats), flush=True) timing = job.getTiming() @@ -643,7 +643,7 @@ def handleJobStatus(self, job, caveats=None): # Just print current status without saving results else: caveats = False if caveats is None else caveats - print((util.formatResult(job, self.options, result=job.getStatus().status, caveats=caveats))) + print(util.formatResult(job, self.options, 
result=job.getStatus().status, caveats=caveats), flush=True) # Print final results, close open files, and exit with the correct error code def cleanup(self): From d4c7a79f04c3cd4b1aee07a232a2ade05d8aa4d0 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 11:13:14 -0600 Subject: [PATCH 234/243] Generalize the NULL output checking for openmpi on HPC --- python/TestHarness/runners/HPCRunner.py | 46 +++++++++++++++++-------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py index 6bf2055ba342..86015cf2964d 100644 --- a/python/TestHarness/runners/HPCRunner.py +++ b/python/TestHarness/runners/HPCRunner.py @@ -138,22 +138,40 @@ def print_files(files, type): timer.stop('runner_run', end_time) # Handle openmpi appending a null character at the end of jobs - # that return a nonzero exit code. We don't know how to fix this - # in openmpi yet, so this is the cleanest way to take care of it. - # We're looking for \n[\0,\x00]##########', which is at the end of the - # apptainer execution within hpc_template. This allows the null - # character check that happens in Runner.finalize() to still - # be valid. + # that return a nonzero exit code. An example of this is: + # + # -------------------------------------------------------------------------- + # MPI_ABORT was invoked on rank 0 in communicator MPI_COMM_WORLD + # Proc: [[PID,1],0] + # Errorcode: 1 + # + # NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes. + # You may or may not see output from other processes, depending on + # exactly when Open MPI kills them. + # -------------------------------------------------------------------------- + # -------------------------------------------------------------------------- + # prterun has exited due to process rank 0 with PID 0 on node HOSTNAME calling + # "abort". 
This may have caused other processes in the application to be + # terminated by signals sent by prterun (as reported here). + # -------------------------------------------------------------------------- + # + # + # Where the blank comment lines above are where the null character ends up. Thus, in cases + # where we have a nonzero exit code and an MPI_ABORT, we'll try to remove these. if self.exit_code != 0 and self.job.getTester().hasOpenMPI(): output = self.getRunOutput().getOutput(sanitize=False) - if output: - for null in ['\0', '\x00']: - prefix = '\n' - suffix = '##########' - all = f'{prefix}{null}{suffix}' - no_null = f'{prefix}{suffix}' - if all in output: - self.getRunOutput().setOutput(output.replace(all, no_null)) + if 'MPI_ABORT' in output: + output_changed = False + if output: + for null in ['\0', '\x00']: + prefix = '-'*74 + '\n' + prefix_with_null = prefix + null + if prefix_with_null in output: + output = output.replace(prefix_with_null, prefix, 1) + output_changed = True + if output_changed: + self.getRunOutput().setOutput(output) + def kill(self): if self.hpc_job: From 97a2175ff02b8f00b0dcffbef71609f058c6d085 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 15:12:31 -0600 Subject: [PATCH 235/243] Make all of the keys lower case --- python/TestHarness/TestHarness.py | 28 ++++++++-------- python/TestHarness/schedulers/Job.py | 32 +++++++++---------- python/TestHarness/testers/Tester.py | 8 ++--- python/TestHarness/tests/test_WriteResults.py | 4 +-- 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index e37b7469f824..175a844fabb4 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -836,7 +836,7 @@ def initializeResults(self): print(f'ERROR: Failed to load result {self.options.results_file}') raise - if self.options.results_storage['INCOMPLETE']: + if self.options.results_storage['incomplete']: print(f'ERROR: The previous result 
{self.options.results_file} is incomplete!') sys.exit(1) @@ -844,7 +844,7 @@ def initializeResults(self): _input_file_name = 'tests' if self.options.input_file_name: _input_file_name = self.options.input_file_name - self.options.input_file_name = self.options.results_storage.get('INPUT_FILE_NAME', _input_file_name) + self.options.input_file_name = self.options.results_storage.get('input_file_name', _input_file_name) # Done working with existing storage return @@ -858,32 +858,32 @@ def initializeResults(self): storage = self.options.results_storage # Record the input file name that was used - storage['INPUT_FILE_NAME'] = self.options.input_file_name + storage['input_file_name'] = self.options.input_file_name # Record that we are using --sep-files - storage['SEP_FILES'] = self.options.sep_files + storage['sep_files'] = self.options.sep_files # Record the Scheduler Plugin used - storage['SCHEDULER'] = self.scheduler.__class__.__name__ + storage['scheduler'] = self.scheduler.__class__.__name__ # Record information on the host we can ran on - storage['HOSTNAME'] = socket.gethostname() - storage['USER'] = getpass.getuser() - storage['TESTHARNESS_PATH'] = os.path.abspath(os.path.join(os.path.abspath(__file__), '..')) - storage['TESTHARNESS_ARGS'] = sys.argv[1:] - storage['MOOSE_DIR'] = self.moose_dir + storage['hostname'] = socket.gethostname() + storage['user'] = getpass.getuser() + storage['testharness_path'] = os.path.abspath(os.path.join(os.path.abspath(__file__), '..')) + storage['testharness_args'] = sys.argv[1:] + storage['moose_dir'] = self.moose_dir # Record when the run began - storage['TIME'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + storage['time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # Record any additional data from the scheduler storage.update(self.scheduler.appendResultFileHeader()) # Record whether or not the storage is incomplete - storage['INCOMPLETE'] = True + storage['incomplete'] = True # Empty storage for the tests 
- storage['TESTS'] = {} + storage['tests'] = {} # Write the headers self.writeResults() @@ -898,7 +898,7 @@ def writeResults(self, complete=False): raise Exception('Should not write results') # Make it as complete (run is done) - self.options.results_storage['INCOMPLETE'] = not complete + self.options.results_storage['incomplete'] = not complete # Store to a temporary file so that we always have a working file file = self.options.results_file diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 6d26b56b3596..61f44a2a9659 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -759,21 +759,21 @@ def storeResults(self, scheduler): joint_status = self.getJointStatus() # Base job data - job_data = {'NAME' : self.getTestNameShort(), - 'LONG_NAME' : self.getTestName(), - 'TIMING' : self.timer.totalTimes(), - 'STATUS' : joint_status.status, - 'STATUS_MESSAGE' : joint_status.message, - 'FAIL' : self.isFail(), - 'COLOR' : joint_status.color, - 'CAVEATS' : list(self.getCaveats()), - 'TESTER_OUTPUT_FILES' : self.getOutputFiles(self.options), - 'INPUT_FILE' : self.getInputFile(), - 'COMMAND' : self.getCommand()} + job_data = {'name' : self.getTestNameShort(), + 'long_name' : self.getTestName(), + 'timing' : self.timer.totalTimes(), + 'status' : joint_status.status, + 'status_message' : joint_status.message, + 'fail' : self.isFail(), + 'color' : joint_status.color, + 'caveats' : list(self.getCaveats()), + 'tester_output_files' : self.getOutputFiles(self.options), + 'input_file' : self.getInputFile(), + 'command' : self.getCommand()} if self.hasSeperateOutput(): - job_data['OUTPUT_FILES'] = self.getCombinedSeparateOutputPaths() + job_data['output_files'] = self.getCombinedSeparateOutputPaths() else: - job_data['OUTPUT'] = self.getAllOutput() + job_data['output'] = self.getAllOutput() # Extend with data from the scheduler, if any job_data.update(scheduler.appendResultFileJob(self)) @@ -799,13 
+799,13 @@ def loadPreviousResults(self): # Set the previous times self.timer.reset() time_now = Timer.time_now() - for name, total_time in test_entry['TIMING'].items(): + for name, total_time in test_entry['timing'].items(): self.timer.start(name, time_now) self.timer.stop(name, time_now + total_time) # Load the output - output_files = test_entry.get('OUTPUT_FILES') - output = test_entry.get('OUTPUT') + output_files = test_entry.get('output_files') + output = test_entry.get('output') for name, object in self.getOutputObjects().items(): if output_files: # --sep-files object.setSeparateOutputPath(output_files[name]) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 2d1841a76b20..daae44a426b4 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -178,7 +178,7 @@ def createStatus(self): def getResultsEntry(self, options, create, graceful=False): """ Get the entry in the results storage for this tester """ - tests = options.results_storage['TESTS'] + tests = options.results_storage['tests'] test_dir = self.getTestDir() test_dir_entry = tests.get(test_dir) @@ -206,9 +206,9 @@ def previousTesterStatus(self, options): test_dir_entry, test_entry = self.getResultsEntry(options, False, True) status = (self.test_status.createStatus(), '', '') if test_entry: - status = (self.test_status.createStatus(str(test_entry['STATUS'])), - str(test_entry['STATUS_MESSAGE']), - test_entry['CAVEATS']) + status = (self.test_status.createStatus(str(test_entry['status'])), + str(test_entry['status_message']), + test_entry['caveats']) return (status) def getStatusMessage(self): diff --git a/python/TestHarness/tests/test_WriteResults.py b/python/TestHarness/tests/test_WriteResults.py index f105c130e6ad..c1a9493edf1c 100644 --- a/python/TestHarness/tests/test_WriteResults.py +++ b/python/TestHarness/tests/test_WriteResults.py @@ -28,7 +28,7 @@ def checkFilesExist(self, output_dir, tests, output_object_names): # 
Load the previous results with open(os.path.join(output_dir, '.previous_test_results.json')) as f: results = json.load(f) - test_results = results['TESTS'] + test_results = results['tests'] # We should only have one test spec self.assertEqual(1, len(test_results)) # The test spec should be in the results @@ -43,7 +43,7 @@ def checkFilesExist(self, output_dir, tests, output_object_names): self.assertIn(test_name_short, test_spec_results) test_results = test_spec_results[test_name_short] # Get the output files from the test spec - result_output_files = test_results['OUTPUT_FILES'] + result_output_files = test_results['output_files'] # Make sure each output file exists and is set in the results file for name in output_object_names: output_path = f'{output_base_path}/{test}.{name}_out.txt' From ffbab17b6c3b718807fae300ca5450bed3118d10 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 16:22:36 -0600 Subject: [PATCH 236/243] Add apptainer information to results --- python/TestHarness/TestHarness.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 175a844fabb4..a4f5ff95d617 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -873,6 +873,18 @@ def initializeResults(self): storage['testharness_args'] = sys.argv[1:] storage['moose_dir'] = self.moose_dir + # Record information from apptainer, if any + apptainer_container = os.environ.get('APPTAINER_CONTAINER') + if apptainer_container: + apptainer = {'path': apptainer_container} + # Information from ApptainerGenerator generated containers + var_prefix = 'MOOSE_APPTAINER_GENERATOR' + generator_name = os.environ.get(f'{var_prefix}_NAME') + if generator_name: + for suffix in ['LIBRARY', 'NAME', 'TAG', 'VERSION']: + apptainer[f'generator_{suffix.lower()}'] = os.environ.get(f'{var_prefix}_{suffix}') + storage['apptainer'] = apptainer + # Record when the run began storage['time'] = 
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') From c4abdb41898acc5d5b80c85698ca7d05da6a56f6 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 16:23:10 -0600 Subject: [PATCH 237/243] Add more Tester context to the results file --- python/TestHarness/schedulers/Job.py | 9 ++------- python/TestHarness/testers/RunApp.py | 2 +- python/TestHarness/testers/Tester.py | 10 ++++++++++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 61f44a2a9659..d17a80fcfda1 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -317,10 +317,7 @@ def getMaxTime(self): def getInputFile(self): """ Wrapper method to return input filename """ - input_file = self.__tester.getInputFile() - if input_file: - return os.path.join(self.getTestDir(), input_file) - return None + return self.__tester.getInputFile() def getInputFileContents(self): """ Wrapper method to return input file contents """ @@ -767,9 +764,7 @@ def storeResults(self, scheduler): 'fail' : self.isFail(), 'color' : joint_status.color, 'caveats' : list(self.getCaveats()), - 'tester_output_files' : self.getOutputFiles(self.options), - 'input_file' : self.getInputFile(), - 'command' : self.getCommand()} + 'tester' : self.getTester().getResults(self.options)} if self.hasSeperateOutput(): job_data['output_files'] = self.getCombinedSeparateOutputPaths() else: diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 86fb28182be3..b3c8d41f78e0 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -79,7 +79,7 @@ def __init__(self, name, params): def getInputFile(self): if self.specs.isValid('input'): - return self.specs['input'].strip() + return os.path.join(self.getTestDir(), self.specs['input'].strip()) else: return None # Not all testers that inherit from RunApp have an input file diff --git 
a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index daae44a426b4..2adbc330483b 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -211,6 +211,16 @@ def previousTesterStatus(self, options): test_entry['caveats']) return (status) + def getResults(self, options) -> dict: + """Get the results dict for this Tester""" + output_files = [] + for file in self.getOutputFiles(options): + output_files.append(os.path.join(self.getTestDir(), file)) + return {'name': self.__class__.__name__, + 'command': self.getCommand(options), + 'input_file': self.getInputFile(), + 'output_files': output_files} + def getStatusMessage(self): return self.__tester_message From 197c5fe6ed615a2cd924258475052342f3378788 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 19:27:11 -0600 Subject: [PATCH 238/243] Correct the decoded command print for openmpi --- python/TestHarness/schedulers/hpc_run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/hpc_run.py b/python/TestHarness/schedulers/hpc_run.py index d0f9a68a450c..025a70ec8ce4 100755 --- a/python/TestHarness/schedulers/hpc_run.py +++ b/python/TestHarness/schedulers/hpc_run.py @@ -26,7 +26,10 @@ command = shlex.split(urllib.parse.unquote(encoded_command)) # Try to only print this on rank 0 - if os.environ.get('PMI_RANK', '0') == '0': + rank = os.environ.get('PMI_RANK') # mpich + if rank is None: + rank = os.environ.get('OMPI_COMM_WORLD_RANK') # openmpi + if rank == '0' or rank is None: print('Running decoded command:', ' '.join(command), flush=True) # Run the thing; close_fds=False needed for MPI From 852d1bcc6bd39e7223c1a19e0ea90336f988d7eb Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 19:27:19 -0600 Subject: [PATCH 239/243] Make HPC key lowercase --- python/TestHarness/schedulers/RunHPC.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 3ec98f40b44f..44de8319989a 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -903,15 +903,15 @@ def appendResultFileHeader(self): 'pre_source_file': self.options.hpc_pre_source, 'pre_source': self.source_contents, 'hosts': self.options.hpc_host if isinstance(self.options.hpc_host, list) else [self.options.hpc_host]} - return {'HPC': entry} + return {'hpc': entry} def appendResultFileJob(self, job): hpc_job = self.hpc_jobs.get(job.getID()) if not hpc_job: - return {'HPC': None} + return {'hpc': None} entry = {'id': hpc_job.id, 'submission_script': self.getHPCJobSubmissionPath(job)} - return {'HPC': entry} + return {'hpc': entry} def callHPCShouldRetry(self, pool_type, result: str): """ From 84a3034467aa1ae13c4327aabcfdd31e90e971ea Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 13 Sep 2024 19:57:19 -0600 Subject: [PATCH 240/243] Show the output file path with --sep-files --- python/TestHarness/schedulers/Job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index d17a80fcfda1..4100e74055ac 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -573,7 +573,8 @@ def getOutputForScreen(self): if name == 'runner_run': # Don't output the runner run if skip_runner_run: - output += f'\nSkipping runner_run output due to {skip_runner_run}\n' + output += f'\nSkipping runner_run output due to {skip_runner_run}; output located at:\n' + output += object.getSeparateOutputFilePath() + '\n' continue # Default trimmed output size From 2ea04696f538b8d76623178befe104fb3c44e4d8 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 23 Oct 2024 09:01:00 -0600 Subject: [PATCH 241/243] Setup a consistent tmpdir for each job and relocate .apptainer for sharens --- python/TestHarness/schedulers/hpc_template 
| 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template index 3bb858be12d3..730ff8e66184 100644 --- a/python/TestHarness/schedulers/hpc_template +++ b/python/TestHarness/schedulers/hpc_template @@ -59,6 +59,25 @@ cd {{ CWD }} # Make a temp file to store the time time_output=$(mktemp) +# Make a temporary directory that's shared for this job. Because slurm doesn't +# make a tmpdir by default, this gets us a consistent tmpdir across all schedulers +{%- if SCHEDULER_NAME == "pbs" %} +NUM_NODES=$(sort $PBS_NODEFILE | uniq -c | wc -l) +{%- else %} +NUM_NODES=${SLURM_JOB_NUM_NODES} +{%- endif %} +JOB_TMPDIR="$(mktemp -d -u --suffix _${{ JOB_ID_VARIABLE }})" +# This ONLY works for openmpi right now; -N needs to be -ppn for mpich +mpiexec -n ${NUM_NODES} -N 1 mkdir ${JOB_TMPDIR} +export TMPDIR="${JOB_TMPDIR}" + +# If we're using a run with APPTAINER_SHARENS, we really don't want to use /home +# as a location for storing instance state as it can be very fickle. 
So, use a +# temporary one +if [ "$APPTAINER_SHARENS" == "1" ]; then + export APPTAINER_CONFIGDIR="${JOB_TMPDIR}/.apptainer" +fi + # Don't exit on failure: need to capture the actual run's return code set +e # Run the command, wrapped in time so that we can capture the real runtime @@ -114,6 +133,10 @@ done # that we have obtained all of the output printf "{{ ENDING_COMMENT }}" +# Clean up the job tempdir +set +e +mpiexec -n ${NUM_NODES} -N 1 rm -rf ${JOB_TMPDIR} + # Exit with the real return code from the job that we ran exit $return_code From fb9227f81f9dc4d4d32546d1195c307dd4b222c7 Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Wed, 23 Oct 2024 16:20:03 -0600 Subject: [PATCH 242/243] Add support for resubmission due to JOB_EXEC_FAIL2 lemhi issue --- python/TestHarness/schedulers/Job.py | 4 ++ python/TestHarness/schedulers/RunHPC.py | 69 ++++++++++++++++++++----- python/TestHarness/schedulers/RunPBS.py | 6 +++ python/TestHarness/testers/Tester.py | 4 ++ 4 files changed, 71 insertions(+), 12 deletions(-) diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 4100e74055ac..8dfa1b3c383b 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -284,6 +284,10 @@ def addCaveats(self, kwargs): """ Wrapper method for setting caveats """ return self.__tester.addCaveats(kwargs) + def removeCaveat(self, caveat): + """ Wrapper method for removing caveats """ + return self.__tester.removeCaveat(caveat) + def getCaveats(self): """ Wrapper method for getting caveats """ return self.__tester.getCaveats() diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 44de8319989a..2f5cdb005d0a 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -13,10 +13,10 @@ from enum import Enum import paramiko import jinja2 -import copy import statistics +import contextlib from multiprocessing.pool import ThreadPool -from 
TestHarness import OutputInterface, util +from TestHarness import util class HPCJob: # The valid job states for a HPC job @@ -36,6 +36,8 @@ def __init__(self, job): self.state = self.State.waiting # The exit code of the command that was ran (if any) self.exit_code = None + # The number of times the job has been resubmitted + self.num_resubmit = 0 # Lock for accessing this object self.lock = threading.Lock() @@ -64,6 +66,17 @@ def isKilled(self): """ return self.getState() == self.State.killed + def reset(self): + """ + Resets the job state + + Not thread safe; should be called within a lock + """ + self.id = None + self.command = None + self.state = self.State.waiting + self.exit_code = None + class RunHPC(RunParallel): # The types for the pools for calling HPC commands CallHPCPoolType = Enum('CallHPCPoolType', ['submit', 'queue', 'status', 'kill']) @@ -294,13 +307,42 @@ def availableSlots(self, params): # Support managing 250 HPC jobs concurrently return 250, False - def submitJob(self, job, hold): + @staticmethod + def jobCaveat(hpc_job) -> str: + """ + Gets the caveat associated with the job ID for a HPCJob + """ + job_id = hpc_job.id + assert job_id is not None + return f'job={job_id}' if job_id.isdigit() else job_id + + def resubmitHPCJob(self, hpc_job): + """ + Resubmits the given HPCJob. + + The HPCJob must have already been submitted. + + This should be called from within the derived + scheduler to resubmit. + """ + assert hpc_job.state != hpc_job.State.waiting + job = hpc_job.job + job.removeCaveat(self.jobCaveat(hpc_job)) + hpc_job.job.addCaveats('resubmitted') + hpc_job.reset() + hpc_job.num_resubmit += 1 + self.submitJob(job, False, lock=False) + + def submitJob(self, job, hold, lock=True): """ Method for submitting an HPC job for the given Job. The "hold" flag specifies whether or not to submit the job in a held state. + Set lock=False if calling this within a method + where the HPC job lock is already obtained. + Returns the resulting HPCJob. 
""" # If we're submitting this Job to be held, but the Job status isn't @@ -316,10 +358,12 @@ def submitJob(self, job, hold): # Job hasn't been recorded yet; set up with a waiting state if hpc_job is None: + assert lock is True self.hpc_jobs[job.getID()] = HPCJob(job) hpc_job = self.hpc_jobs.get(job.getID()) - with hpc_job.getLock(): + hpc_job_lock = hpc_job.getLock() if lock else contextlib.nullcontext() + with hpc_job_lock: # Job has already been submitted if hpc_job.state != hpc_job.State.waiting: return hpc_job @@ -480,22 +524,20 @@ def submitJob(self, job, hold): if exit_code != 0: raise self.CallHPCException(self, f'{submission_command} failed', full_cmd, result) - # Parse the job ID from the command - job_id = self.parseHPCSubmissionJobID(result) + # Set the HPC job state + hpc_job.id = self.parseHPCSubmissionJobID(result) + hpc_job.command = job_command + hpc_job.state = hpc_job.State.held if hold else hpc_job.State.queued # Job has been submitted, so set it as queued # Here we append job_id if the ID is just a number so that it's more # obvious what it is - job.addCaveats(f'job={job_id}' if job_id.isdigit() else job_id) + job.addCaveats(self.jobCaveat(hpc_job)) # Print the job as it's been submitted job_status = job.hold if hold else job.queued self.setAndOutputJobStatus(hpc_job.job, job_status, caveats=True) - hpc_job.id = job_id - hpc_job.command = job_command - hpc_job.state = hpc_job.State.held if hold else hpc_job.State.queued - return hpc_job def queueJob(self, job): @@ -883,9 +925,11 @@ def appendResultFooter(self): times = {} for key in timer_keys: times[key] = [] + num_resubmit = 0 for hpc_job in self.hpc_jobs.values(): timer = hpc_job.job.timer + num_resubmit += hpc_job.num_resubmit for key in timer_keys: if timer.hasTotalTime(key): times[key].append(timer.totalTime(key)) @@ -895,7 +939,8 @@ def appendResultFooter(self): averages[key] = statistics.mean(values) if values else 0 result = f'Average queue time {averages["hpc_queued"]:.1f} seconds, ' 
- result += f'average output wait time {averages["hpc_wait_output"]:.1f} seconds.' + result += f'average output wait time {averages["hpc_wait_output"]:.1f} seconds, ' + result += f'{num_resubmit} jobs resubmitted.' return result def appendResultFileHeader(self): diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 6e517e70ed69..06e09ae3db9f 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -84,6 +84,12 @@ def parse_time(name): if exit_code is not None: if exit_code < 0: name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason')) + # Job failed to start outside of our submission script, so + # try it again a few times. This was implemented due to a + # common issue on lemhi + if name == 'JOB_EXEC_FAIL2' and hpc_job.num_resubmit <= 5: + self.resubmitHPCJob(hpc_job) + continue # Job timed out; give this a special timeout status because # it is then marked as recoverable (could try running again) if name == 'JOB_EXEC_KILL_WALLTIME': diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 2adbc330483b..3698b5ca4c0f 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -453,6 +453,10 @@ def addCaveats(self, *kwargs): self.__caveats.add(i) return self.getCaveats() + def removeCaveat(self, caveat): + """ Removes a caveat, which _must_ exist """ + self.__caveats.remove(caveat) + def getCaveats(self): """ Return caveats accumalted by this tester """ return self.__caveats From 345144fe829468f46ac382b2b20b6e7eb73a645c Mon Sep 17 00:00:00 2001 From: Logan Harbour Date: Fri, 25 Oct 2024 12:59:43 -0600 Subject: [PATCH 243/243] Store stats in the previous results --- python/TestHarness/TestHarness.py | 63 ++++++++++++++-------- python/TestHarness/schedulers/RunHPC.py | 15 ++++-- python/TestHarness/schedulers/Scheduler.py | 6 ++- 3 files changed, 57 insertions(+), 27 deletions(-) 
diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index a4f5ff95d617..28a531475329 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -645,6 +645,28 @@ def handleJobStatus(self, job, caveats=None): caveats = False if caveats is None else caveats print(util.formatResult(job, self.options, result=job.getStatus().status, caveats=caveats), flush=True) + def getStats(self, time_total: float) -> dict: + """ + Get cumulative stats for all runs + """ + num_nonzero_timing = sum(1 if float(tup[0].getTiming()) > 0 else 0 for tup in self.test_table) + if num_nonzero_timing > 0: + time_max = max(float(tup[0].getTiming()) for tup in self.test_table) + time_average = sum(float(tup[0].getTiming()) for tup in self.test_table) / num_nonzero_timing + else: + time_max = 0 + time_average = 0 + + stats = {'num_passed': self.num_passed, + 'num_failed': self.num_failed, + 'num_skipped': self.num_skipped, + 'num_total': self.num_passed + self.num_failed + self.num_skipped, + 'time_total': time_total, + 'time_max': time_max, + 'time_average': time_average} + stats.update(self.scheduler.appendStats()) + return stats + # Print final results, close open files, and exit with the correct error code def cleanup(self): # Print the results table again if a bunch of output was spewed to the screen between @@ -659,7 +681,8 @@ def cleanup(self): for (job, sort_value, timing) in sorted(self.test_table, key=lambda x: x[1]): print((util.formatResult(job, self.options, caveats=True))) - time = (datetime.datetime.now() - self.start_time).total_seconds() + time_total = (datetime.datetime.now() - self.start_time).total_seconds() + stats = self.getStats(time_total) print(('-' * (self.options.term_cols))) @@ -671,13 +694,10 @@ def cleanup(self): # Print a different footer when performing a dry run if self.options.dry_run: - print(('Processed %d tests in %.1f seconds.' 
% (self.num_passed+self.num_skipped, time))) - summary = '%d would run' - summary += ', %d would be skipped' + print(f'Processed {self.num_passed + self.num_skipped} tests in {stats["time_total"]:.1f} seconds.') + summary = f'{self.num_passed} would run, {self.num_skipped} would be skipped' summary += fatal_error - print((util.colorText( summary % (self.num_passed, self.num_skipped), "", html = True, \ - colored=self.options.colored, code=self.options.code ))) - + print(util.colorText(summary, "", html=True, colored=self.options.colored, code=self.options.code)) else: num_nonzero_timing = sum(1 if float(tup[0].getTiming()) > 0 else 0 for tup in self.test_table) if num_nonzero_timing > 0: @@ -686,37 +706,34 @@ def cleanup(self): else: timing_max = 0 timing_avg = 0 - summary = f'Ran {self.num_passed + self.num_failed} tests in {time:.1f} seconds.' + summary = f'Ran {self.num_passed + self.num_failed} tests in {stats["time_total"]:.1f} seconds.' summary += f' Average test time {timing_avg:.1f} seconds,' summary += f' maximum test time {timing_max:.1f} seconds.' 
print(summary) # Get additional results from the scheduler - scheduler_summary = self.scheduler.appendResultFooter() + scheduler_summary = self.scheduler.appendResultFooter(stats) if scheduler_summary: print(scheduler_summary) if self.num_passed: - summary = '%d passed' + summary = f'{self.num_passed} passed' else: - summary = '%d passed' - summary += ', %d skipped' + summary = f'{self.num_passed} passed' + summary += f', {self.num_skipped} skipped' if self.num_pending: - summary += ', %d pending' - else: - summary += ', %d pending' + summary += f', {self.num_pending} pending' if self.num_failed: - summary += ', %d FAILED' + summary += f', {self.num_failed} FAILED' else: - summary += ', %d failed' + summary += f', {self.num_failed} failed' if self.scheduler.maxFailures(): self.error_code = self.error_code | 0x80 summary += '\nMAX FAILURES REACHED' summary += fatal_error - print((util.colorText( summary % (self.num_passed, self.num_skipped, self.num_pending, self.num_failed), "", html = True, \ - colored=self.options.colored, code=self.options.code ))) + print(util.colorText(summary, "", html=True, colored=self.options.colored, code=self.options.code)) if self.options.longest_jobs: # Sort all jobs by run time @@ -780,8 +797,8 @@ def cleanup(self): for job in job_group: job.storeResults(self.scheduler) - # And write the results - self.writeResults(complete=True) + # And write the results, including the stats + self.writeResults(complete=True, stats=stats) try: # Write one file, with verbose information (--file) @@ -900,7 +917,7 @@ def initializeResults(self): # Write the headers self.writeResults() - def writeResults(self, complete=False): + def writeResults(self, complete=False, stats=None): """ Forcefully write the current results to file Will not do anything if using existing storage. 
@@ -911,6 +928,8 @@ def writeResults(self, complete=False): # Make it as complete (run is done) self.options.results_storage['incomplete'] = not complete + # Store the stats + self.options.results_storage['stats'] = stats # Store to a temporary file so that we always have a working file file = self.options.results_file diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py index 2f5cdb005d0a..b8530b36e932 100644 --- a/python/TestHarness/schedulers/RunHPC.py +++ b/python/TestHarness/schedulers/RunHPC.py @@ -920,7 +920,7 @@ def waitFinish(self): functor = lambda hpc_job: hpc_job.state == hpc_job.State.held self.killHPCJobs(functor) - def appendResultFooter(self): + def appendStats(self): timer_keys = ['hpc_queued', 'hpc_wait_output'] times = {} for key in timer_keys: @@ -938,9 +938,16 @@ def appendResultFooter(self): for key, values in times.items(): averages[key] = statistics.mean(values) if values else 0 - result = f'Average queue time {averages["hpc_queued"]:.1f} seconds, ' - result += f'average output wait time {averages["hpc_wait_output"]:.1f} seconds, ' - result += f'{num_resubmit} jobs resubmitted.' + stats = super().appendStats() + stats.update({'hpc_time_queue_average': averages['hpc_queued'], + 'hpc_time_wait_output_average': averages['hpc_wait_output'], + 'hpc_num_resubmitted': num_resubmit}) + return stats + + def appendResultFooter(self, stats): + result = f'Average queue time {stats["hpc_time_queue_average"]:.1f} seconds, ' + result += f'average output wait time {stats["hpc_time_wait_output_average"]:.1f} seconds, ' + result += f'{stats["hpc_num_resubmitted"]} jobs resubmitted.' 
return result def appendResultFileHeader(self): diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 5092e80d9c73..b34832c430a1 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -548,7 +548,7 @@ def runJob(self, job, jobs): except KeyboardInterrupt: self.killRemaining(keyboard=True) - def appendResultFooter(self) -> str or None: + def appendResultFooter(self, stats: dict) -> str: """ Entrypoint to add additional results to the on screen result footer """ return None @@ -559,3 +559,7 @@ def appendResultFileHeader(self) -> dict: def appendResultFileJob(self, job) -> dict: """ Entrypoint to add entries to the result file for a job """ return {} + + def appendStats(self) -> dict: + """ Entrypoint to add entries to the harness statistics """ + return {}