diff --git a/.gitignore b/.gitignore index 86df037c5f3e..e73ac48094af 100644 --- a/.gitignore +++ b/.gitignore @@ -326,3 +326,14 @@ share/ /modules/misc/misc.yaml /tutorials/tutorial01_app_development/*/babbler.yaml /tutorials/darcy_thermo_mech/*/darcy_thermo_mech.yaml + +# test harness --sep-files output +*.runner_run_out.txt +*.runner_out.txt +*.tester_out.txt +*.job_out.txt + +# test harness hpc output +*.hpc_out.txt +*.hpc_result +*.hpc_submit diff --git a/CODEOWNERS b/CODEOWNERS index 9797647bfc5c..84c1c00d6c61 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -77,6 +77,7 @@ /python/MooseDocs @cticenhour /python/moosesqa @cticenhour +/python/TestHarness @loganharbour @milljm /scripts/hpc_proxy.pac @loganharbour /scripts/configure_petsc.sh @cticenhour @milljm @loganharbour diff --git a/framework/app.mk b/framework/app.mk index 5c281d576ba5..a8111811520a 100644 --- a/framework/app.mk +++ b/framework/app.mk @@ -498,6 +498,17 @@ install_data_%: @mkdir -p $($@_dst) @cp -r $($@_src) $($@_dst) +ifneq ($(wildcard $(APPLICATION_DIR)/scripts/TestHarness/testers),) +install_tester_$(APPLICATION_NAME)_src := $(APPLICATION_DIR)/scripts/TestHarness/testers +install_tester_$(APPLICATION_NAME)_dst := $(share_install_dir)/scripts/TestHarness +install_testers:: install_tester_$(APPLICATION_NAME) +endif + +install_tester_%: + @echo "Installing TestHarness testers "$($@_dst)"..." + @mkdir -p $($@_dst) + @cp -r $($@_src) $($@_dst) + $(copy_input_targets): @$(eval kv := $(subst ->, ,$(subst target_$(APPLICATION_NAME)_,,$@))) @$(eval source_dir := $(word 1, $(kv))) diff --git a/framework/moose.mk b/framework/moose.mk index 4fee7dc0770e..2e5a2dba7096 100644 --- a/framework/moose.mk +++ b/framework/moose.mk @@ -484,12 +484,15 @@ moose_share_dir = $(share_dir)/moose python_install_dir = $(moose_share_dir)/python bin_install_dir = $(PREFIX)/bin -install: all install_all_libs install_bin install_harness install_exodiff install_adreal_monolith install_hit install_data +install: all install_all_libs install_bin install_harness install_exodiff install_adreal_monolith install_hit install_data install_testers install_data:: @mkdir -p $(moose_share_dir) @cp -a $(FRAMEWORK_DIR)/data $(moose_share_dir)/ +install_testers:: + @: + install_adreal_monolith: ADRealMonolithic.h @ mkdir -p $(moose_include_dir) @cp -f $< $(moose_include_dir)/ diff --git a/modules/doc/content/application_development/index.md b/modules/doc/content/application_development/index.md index f915e35b0ec5..0b8c72d65ab6 100644 --- a/modules/doc/content/application_development/index.md +++ b/modules/doc/content/application_development/index.md @@ -20,8 +20,6 @@ These documentation pages are meant to be used by developers who are developing [Test System](/test_system.md) - How to create/maintain tests for your application -[Performance Benchmarking](/performance_benchmarking.md) - How to perform benchmarking - [Profiling](/profiling.md) - How to profile your application in order to determine what functions are hogging compute time. 
[Code Coverage](/coverage.md) - How to add automatic code coverage to your application, and use it in your development workflow diff --git a/modules/doc/content/application_development/performance_benchmarking.md b/modules/doc/content/application_development/performance_benchmarking.md index 1950bf72d33e..83c27f13038d 100644 --- a/modules/doc/content/application_development/performance_benchmarking.md +++ b/modules/doc/content/application_development/performance_benchmarking.md @@ -1,165 +1,4 @@ # Performance Benchmarking -Utilities for doing performance benchmarking of MOOSE-based applications are included in the main -MOOSE repository. These utilities provide functionality for benchmarking and tracking MOOSE -performance. They can be used to run benchmarks, generate trend visualizations, and look at stats -comparing benchmarks between various revisions. The following sections describe how to setup a -benchmark machine and use it to run benchmarks and visualize results. - -## Tuning a Benchmarking Machine - -In order to obtain accurate results, you need to run the benchmark process(es) -as close to isolated as possible. On a linux system, you should e.g. use cpu -isolation via setting kernel boot parameters: - -```text -isolcpus=[n] rcu_nocbs=[n] -``` - -in your boot loader (e.g. grub). The benchmarking tools/scripts in MOOSE should automatically -detect CPU isolation on Linux and schedule benchmark jobs to those CPUs. You should also disable -any turbo functionality. For example on `intel_pstate` driver cpus: - -```text -$ echo "1" > /sys/devices/system/cpu/intel_pstate/no_turbo -``` - -You will also want to turn off any hyperthreading for cores you use for benchmarking. You can do -this in the bios or by something like: - -```text -$ echo "0" > /sys/devices/system/cpu/cpu[n]/online -``` - -for each hyperthread core you want running - you can look in `/proc/cpuinfo` for pairs of cpus -that have the same core id turning off one of the pair. These will need to be done on every boot. -You can use the sysfsutils package and its `/etc/sysfs.conf` configuration file to do this -persistently on boot - i.e.: - -```text -devices/system/cpu/intel_pstate/no_turbo = 1 -devices/system/cpu/cpu3/online = 0 -devices/system/cpu/cpu5/online = 0 -``` - -## Test Harness Benchmarks - -Benchmarks can be run through the test harness (i.e. using the `run_tests` script) by doing -e.g. `./run_tests --run speedtests`. When this is done, the test harness looks for test spec -files named `speedtests` just like the `tests` files that contain regular moose test details. -The format for these files is: - -```text -[Benchmarks] - [benchmark-name] - type = SpeedTest - input = input-file-name.i - cli_args = '--an-arg=1 a/hit/format/cli/arg=foo' - # optional: - min_runs = 15 # default 40 - max_runs = 100 # default 400 - cumulative_dur = 100 # default 60 sec - [] - - [./benchmark2-name] - type = SpeedTest - input = another-input-file-name.i - cli_args = 'some/cli/arg=bar' - [] - - # ... -[] -``` - -After being run, benchmark data are stored in a sqlite database (default name -`speedtests.sqlite`). When the test harness is run without the `--run speedtests` flag, tests -described in `speedtests` files are run in *check-only* mode where moose just checks that their -input files are well-formed and parse correctly without actually running them. 
- - -## Manual/Direct Benchmarks - -The `[moose-repo]/scripts/benchmark.py` script can be used to manually list and directly run benchmarks without the -test harness (for hacking, debugging, etc.). To do this, the script reads a `bench.list` text -file that specifies which input files should be run and corresponding (benchmark) names for them -along with any optional arguments. The `bench.list` file has the following format: - -```text -[benchmarks] - [./simple_diffusion_refine3] - binary = test/moose_test-opt - input = test/tests/kernels/simple_diffusion/simple_diffusion.i - cli_args = 'Mesh/uniform_refine=3' - [../] - [./simple_diffusion_refine4] - binary = test/moose_test-opt - input = test/tests/kernels/simple_diffusion/simple_diffusion.i - cli_args = 'Mesh/uniform_refine=4' - [../] - [./simple_diffusion_ref5] - binary = test/moose_test-opt - input = test/tests/kernels/simple_diffusion/simple_diffusion.i - cli_args = 'Mesh/uniform_refine=5' - [../] - # ... add as many as you want -[] -``` - -To run the manual benchmarks directly, do this: - -```text -$ ./scripts/benchmark.py --run -``` - -When benchmarks are run, the binaries specified in `bench.list` must already exist. Benchmark -data are then stored in a sqlite database (default name `speedtests.sqlite`). You can specify -the minimum number of runs for each benchmark problem/simulation with the `--min-runs` (default -10). Each benchmark will be run as many times as possible within 1 minute (customizable via the -`--cum-dur` flag) or the specified minimum number of times (whichever is larger). - -## Analyzing Results - -Regardless of how you ran the benchmarks (either by this script or using the test harness), MOOSE -revisions with available benchmark data can be listed (from the database) by running: - -```text -$ ./benchmark.py --list-revs -44d2f3434b3346dc14fc9e86aa99ec433c1bbf10 2016-09-07 19:36:16 -86ced0d0c959c9bdc59497f0bc9324c5cdcd7e8f 2016-09-08 09:29:17 -447b455f1e2d8eda649468ed03ef792504d4b467 2016-09-08 09:43:56 -... 
-``` - -To look at stats comparing benchmark data from two revisions, run: - -```text -$ ./benchmark.py # defaults to using the most recent two revisions of benchmark data --------------------------------- 871c98630c98 to 38bb6f5ebe5f -------------------------------- - benchmark old (sec/run) new (sec/run) speedup (pvalue,nsamples) ----------------------------------------------------------------------------------------------- - simple diffusion (refine3): 0.408034 0.408034 ~ (p=0.996 n=36+36) - - simple diffusion (refine4): 1.554724 1.561682 ~ (p=0.571 n=10+10) - simple diffusion (refine5): 6.592326 6.592326 ~ (p=0.882 n=4+4) ----------------------------------------------------------------------------------------------- - -$ ./benchmark.py -old 44d2f34 -new 447b455 # or specify revisions to compare manually -------------------------------------- 44d2f34 to 447b455 ------------------------------------- - benchmark old (sec/run) new (sec/run) speedup (pvalue,nsamples) ----------------------------------------------------------------------------------------------- - simple diffusion (refine3): 0.416574 0.411435 -1.2% (p=0.000 n=37+37) - simple diffusion (refine4): 1.554724 1.497379 -3.7% (p=0.000 n=10+11) - simple diffusion (refine5): 6.553244 6.360004 -2.9% (p=0.030 n=4+4) ----------------------------------------------------------------------------------------------- -``` - -To generate visualizations, run: - -```text -$ ./scripts/benchmark.py --trends -``` - -This will generate an svg box plot for each benchmark over time/revision in a `trends` -subdirectory. An `index.html` file is also generated that embeds all the svg plots for -convenient viewing all together in a browser. - +!alert error title=Removed +This content has been removed. diff --git a/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md b/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md index c94c0812c394..7308f04cd300 100644 --- a/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md +++ b/modules/doc/content/getting_started/examples_and_tutorials/tutorial01_app_development/step07_parallel.md @@ -69,8 +69,6 @@ There is an entire field of science about [!ac](HPC) and massively parallel proc !alert tip title=Try to target 20,000 [!ac](DOFs)-per-process. MOOSE developers tend to agree that 20,000 is the ideal number of [!ac](DOFs) that a single process may be responsible for. This value is reported as "`Num Local DOFs`" in the terminal printout at the beginning of every execution. There are, of course, some exceptions; if a problem exhibits speedup with less than 20,000 [!ac](DOFs)/process, then just use that. 
-*For more information about application performance, please visit the [application_development/performance_benchmarking.md] page.* - ## Demonstration To demonstrate the importance of parallel execution, the current Darcy pressure input file will be diff --git a/modules/doc/content/infrastructure/index.md b/modules/doc/content/infrastructure/index.md index 18f672117fa6..c57b56d0504f 100644 --- a/modules/doc/content/infrastructure/index.md +++ b/modules/doc/content/infrastructure/index.md @@ -9,4 +9,3 @@ of MOOSE and MOOSE-based applications: - [Python Tools](python/index.md) - [Build System](/build_system.md) - [Test System](/test_system.md) -- [Benchmarking](/performance_benchmarking.md) diff --git a/modules/ray_tracing/test/tests/actions/add_raybc_action/tests b/modules/ray_tracing/test/tests/actions/add_raybc_action/tests index ea1d3f6c6024..bd615011f36c 100644 --- a/modules/ray_tracing/test/tests/actions/add_raybc_action/tests +++ b/modules/ray_tracing/test/tests/actions/add_raybc_action/tests @@ -29,7 +29,7 @@ [multiple] type = RunException input = 'add_raybc_action.i' - cli_args = 'RayBCs/active=multiple_studies "UserObjects/active=\'study another_study\'"' + cli_args = 'RayBCs/active=multiple_studies UserObjects/active="study another_study"' expect_err = "While constructing the NullRayBC 'multiple_studies', multiple RayTracingStudy objects were found." allow_test_objects = true diff --git a/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests b/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests index 9840e54d6f1f..5135b40bfe9a 100644 --- a/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests +++ b/modules/ray_tracing/test/tests/actions/add_raykernel_action/tests @@ -29,7 +29,7 @@ [multiple] type = RunException input = 'add_raykernel_action.i' - cli_args = 'RayKernels/active=multiple_studies "UserObjects/active=\'study another_study\'"' + cli_args = 'RayKernels/active=multiple_studies UserObjects/active="study another_study"' expect_err = "While constructing the NullRayKernel 'multiple_studies', multiple RayTracingStudy objects were found." 
allow_test_objects = true diff --git a/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests b/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests index 8d4f35b5c417..2aba96d03bdf 100644 --- a/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests +++ b/modules/ray_tracing/test/tests/outputs/ray_tracing_mesh_output/tests @@ -57,7 +57,7 @@ Mesh/Partitioner/nx=2 Mesh/Partitioner/ny=2 Outputs/rays/type=RayTracingNemesis - "Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=nemesis_rays' exodiff = 'nemesis_rays.e.4.0 nemesis_rays.e.4.1 nemesis_rays.e.4.2 nemesis_rays.e.4.3' min_parallel = 4 @@ -74,7 +74,7 @@ Mesh/Partitioner/nx=2 Mesh/Partitioner/ny=2 Outputs/rays/type=RayTracingNemesis - "Outputs/rays/output_properties=\'intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=ray_mesh_output_transient_nemesis_rays' # Missing some files here beacuse exodiff doesn't like diffing empty output, # which is the case for the early transient when not all procs are hit @@ -107,7 +107,7 @@ cli_args = 'Mesh/Partitioner/type=GridPartitioner Mesh/Partitioner/ny=3 UserObjects/study/segments_on_cache_traces=false - "Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=no_segments_rays' exodiff = 'no_segments_rays.e' min_parallel = 3 @@ -122,7 +122,7 @@ Mesh/Partitioner/ny=2 Outputs/rays/type=RayTracingNemesis UserObjects/study/segments_on_cache_traces=false - "Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'" + Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes" Outputs/rays/file_base=no_segments_nemesis_rays' exodiff = 'no_segments_nemesis_rays.e.2.0 no_segments_nemesis_rays.e.2.1' min_parallel = 2 diff --git a/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests b/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests index aba5a26a614e..77ecf040d835 100644 --- a/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests +++ b/modules/ray_tracing/test/tests/traceray/internal_sidesets/tests @@ -9,7 +9,7 @@ input = 'internal_sidesets_1d.i' csvdiff = 'internal_sidesets_1d_kill_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_1d_kill_out - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' allow_test_objects = true detail = 'one-dimensional meshes, ' @@ -19,7 +19,7 @@ input = 'internal_sidesets_2d.i' csvdiff = 'internal_sidesets_2d_kill_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_2d_kill_out - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' allow_test_objects = true detail = 'two-dimensional meshes, ' @@ -29,7 +29,7 @@ input = 'internal_sidesets_3d.i' csvdiff = 'internal_sidesets_3d_kill_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_3d_kill_out - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' allow_test_objects = true detail = 'and three-dimensional meshes.' 
@@ -43,7 +43,7 @@ input = 'internal_sidesets_1d.i' csvdiff = 'internal_sidesets_1d_reflect_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_1d_reflect_out - "RayBCs/active=\'kill_external reflect_internal\'"' + RayBCs/active="kill_external reflect_internal"' allow_test_objects = true detail = 'one-dimensional meshes, ' @@ -54,7 +54,7 @@ input = 'internal_sidesets_2d.i' csvdiff = 'internal_sidesets_2d_reflect_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_2d_reflect_out - "RayBCs/active=\'kill_external reflect_internal\'"' + RayBCs/active="kill_external reflect_internal"' allow_test_objects = true detail = 'two-dimensional meshes, ' @@ -65,7 +65,7 @@ input = 'internal_sidesets_3d.i' csvdiff = 'internal_sidesets_3d_reflect_out.csv' cli_args = 'Outputs/file_base=internal_sidesets_3d_reflect_out - "RayBCs/active=\'kill_external reflect_internal\'"' + RayBCs/active="kill_external reflect_internal"' allow_test_objects = true detail = 'and three-dimensional meshes.' @@ -76,7 +76,7 @@ type = RunException input = 'internal_sidesets_1d.i' cli_args = 'UserObjects/study/use_internal_sidesets=false - RayBCs/active=\'kill_internal\'' + RayBCs/active=kill_internal' expect_err = 'RayBCs are defined on internal sidesets, but the study is not set to use internal sidesets during tracing.' allow_test_objects = true diff --git a/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests b/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests index a30c94b6b812..d4fbd70eb070 100644 --- a/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests +++ b/modules/ray_tracing/test/tests/userobjects/ray_tracing_study/bc_create_ray/tests @@ -19,7 +19,7 @@ allow_test_objects = true cli_args = 'Outputs/file_base=bc_create_ray_3d_out Mesh/active=gmg_3d - "RayBCs/active=\'kill_3d create_3d\'"' + RayBCs/active="kill_3d create_3d"' detail = 'and in three-dimensional meshes.' [] [] diff --git a/modules/thermal_hydraulics/test/tests/utils/logger/tests b/modules/thermal_hydraulics/test/tests/utils/logger/tests index a87fbe8267f5..5da6d319da5f 100644 --- a/modules/thermal_hydraulics/test/tests/utils/logger/tests +++ b/modules/thermal_hydraulics/test/tests/utils/logger/tests @@ -14,7 +14,9 @@ [errors] type = RunException input = 'test.i' - expect_err = "componentB: warning 2.*componentA: error 1.*componentA: error 2.*componentB: error 1.*componentB: error 2" + # We can't reliably check for the warnings first here because the ordering of the warning + # and the error is MPI implementation dependent + expect_err = "componentA: error 1.*componentA: error 2.*componentB: error 1.*componentB: error 2" allow_test_objects = true allow_warnings = true # Testing that warnings are emitted requirement = 'The system shall be able to output errors in a batch.' 
diff --git a/python/TestHarness/JobDAG.py b/python/TestHarness/JobDAG.py index 199cd512441a..41b31bf6e064 100644 --- a/python/TestHarness/JobDAG.py +++ b/python/TestHarness/JobDAG.py @@ -9,35 +9,24 @@ from .schedulers.Job import Job from contrib import dag -import pyhit -import os +import sys +import threading class JobDAG(object): """ Class which builds a Job DAG for use by the Scheduler """ - def __init__(self, options): - self.__job_dag = dag.DAG() - self.__parallel_scheduling = None + def __init__(self, options, parallel_scheduling): self.options = options + self.__parallel_scheduling = parallel_scheduling + self.__job_dag = dag.DAG() + self.__j_lock = threading.Lock() - def _setParallel(self): - """ Read the test spec file and determine if parallel_scheduling is set. """ - if self.__parallel_scheduling is not None: - return self.__parallel_scheduling - self.__parallel_scheduling = False - - job = self.getJob() - if job: - # We only need a single tester so we know what spec file to load. - # TODO: would be nice to have access to this without needing tester.specs - tester = job[0].getTester() - root = pyhit.load(os.path.join(tester.specs['test_dir'], tester.specs['spec_file'])) - self.__parallel_scheduling = root.children[0].get('parallel_scheduling', False) - - return self.__parallel_scheduling + def getLock(self): + """ Return the lock for this test spec (folder of jobs) """ + return self.__j_lock def canParallel(self): """ Return bool whether or not this group runs in parallel """ - return self._setParallel() + return self.__parallel_scheduling def createJobs(self, testers): """ Return a usable Job DAG based on supplied list of tester objects """ @@ -45,7 +34,7 @@ def createJobs(self, testers): self.__name_to_job = {} for tester in testers: job = Job(tester, self, self.options) - name = job.getTestName() + name = job.getTestNameShort() if name not in self.__name_to_job: self.__name_to_job[name] = job else: @@ -59,18 +48,12 @@ def getDAG(self): """ return the running DAG object """ return self.__job_dag - def getJobs(self): + def getAvailableJobs(self): """ Return a list of available jobs """ - if (self.canParallel() or self.options.pbs) and not self.options.pedantic_checks: - return self.__job_dag.ind_nodes() - return self.getJob() - - def getJob(self): - """ Return a single available job """ - concurrent_jobs = self.__job_dag.ind_nodes() - if [x for x in concurrent_jobs if x.isHold()]: - return [[x for x in concurrent_jobs if x.isHold()][0]] - return [] + available_jobs = [job for job in self.__job_dag.ind_nodes() if job.isHold()] + if self.canParallel() and not self.options.pedantic_checks: + return available_jobs + return available_jobs[0:1] def getJobsAndAdvance(self): """ @@ -85,14 +68,14 @@ def getJobsAndAdvance(self): next_jobs.add(job) self.__job_dag.delete_node(job) - next_jobs.update(self.getJobs()) + next_jobs.update(self.getAvailableJobs()) return next_jobs def removeAllDependencies(self): """ Flatten current DAG so that it no longer contains any dependency information """ if self.__name_to_job and self.__job_dag.size(): tmp_job_dag = dag.DAG() - for job in self.__job_dag.topological_sort(): + for job in self.getJobs(): tmp_job_dag.add_node(job) self.__job_dag = tmp_job_dag return self.__job_dag @@ -100,13 +83,14 @@ def removeAllDependencies(self): def _checkDAG(self): """ perform some sanity checks on the current DAG """ if self.__job_dag.size(): + # Add edges based on prereqs + self._setupPrereqs() - self._doMakeDependencies() - self._doLast() + # Check for race 
conditions in output + self._checkOutputCollisions() - # If there are race conditions, then there may be more skipped jobs - if self._doRaceConditions(): - self._doSkippedDependencies() + # Remove edges for jobs that are skipped + self._doSkippedDependencies() return self.__job_dag @@ -119,25 +103,20 @@ def _addEdge(self, child, parent): err_output += ' %s <--> %s' % (parent.getTestName().split('.')[1], child.getTestName().split('.')[1]) - parent.setOutput('Cyclic dependency error!\n\t' + err_output) + parent.appendOutput('Cyclic dependency error!\n\t' + err_output) parent.setStatus(parent.error, 'Cyclic or Invalid Dependency Detected!') - def _doLast(self): - for job in self.__job_dag.topological_sort(): - if 'ALL' in job.getPrereqs(): - for a_job in self.__job_dag.topological_sort(): - if a_job != job and not a_job.isSkip(): - if '.ALL' in a_job.getTestName(): - a_job.setStatus(a_job.error, 'Test named ALL when "prereq = ALL" elsewhere in test spec file!') - self._addEdge(a_job, job) - self._doSkippedDependencies() - - def _doMakeDependencies(self): + def _setupPrereqs(self): """ Setup dependencies within the current Job DAG """ + # The jobs that have 'ALL' as a prereq + all_prereq_jobs = [] + + # Setup explicit dependencies (without 'ALL') for job in self.__job_dag.ind_nodes(): prereq_jobs = job.getPrereqs() if prereq_jobs == ['ALL']: - prereq_jobs = [] + all_prereq_jobs.append(job) + continue for prereq_job in prereq_jobs: try: self.__name_to_job[prereq_job] @@ -147,7 +126,15 @@ def _doMakeDependencies(self): # test file has invalid prereq set except KeyError: - job.setStatus(job.error, 'unknown dependency') + job.setStatus(job.error, f'unknown dependency {prereq_job}') + + # Setup dependencies for 'ALL' + for job in all_prereq_jobs: + for a_job in self.getJobs(): + if a_job != job and not a_job.isSkip(): + if '.ALL' in a_job.getTestName(): + a_job.setStatus(a_job.error, 'Test named ALL when "prereq = ALL" elsewhere in test spec file!') + self._addEdge(a_job, job) def _fix_cornercases(self, prereq_job, job): """ @@ -164,7 +151,7 @@ def _fix_cornercases(self, prereq_job, job): def _hasDownStreamsWithFailures(self, job): """ Return True if any dependents of job has previous failures """ for d_job in self.__job_dag.all_downstreams(job): - status, message, caveats = d_job.previousTesterStatus(self.options) + status, message, caveats = d_job.previousTesterStatus() if status in d_job.job_status.getFailingStatuses(): return True @@ -173,7 +160,7 @@ def _doPreviouslyFailed(self, job): Set up statuses for jobs contained within the DAG for use with failed-tests option """ tester = job.getTester() - status, message, caveats = job.previousTesterStatus(self.options) + status, message, caveats = job.previousTesterStatus() # This job passed, but one of its dependents has not if status == tester.success and self._hasDownStreamsWithFailures(job): @@ -196,7 +183,7 @@ def _doPreviouslyFailed(self, job): def _doSkippedDependencies(self): """ Determine which jobs in the DAG should be skipped """ - for job in list(self.__job_dag.topological_sort()): + for job in self.getJobs(): dep_jobs = set([]) if self.options.failed_tests: @@ -219,37 +206,42 @@ def _doSkippedDependencies(self): d_job.addCaveats('skipped dependency') self.__job_dag.delete_edge_if_exists(job, d_job) - def _doRaceConditions(self): - """ Check for race condition errors within in the DAG""" - # Build output_file in relation to job dictionary - output_to_job = {} - for job in self.__job_dag.topological_sort(): - if job.getRunnable() and not 
job.isFinished():
-                for output_file in job.getOutputFiles():
-                    output_to_job[output_file] = output_to_job.get(output_file, [])
-                    output_to_job[output_file].append(job)
-
-        # Remove jobs which have accurate dependencies
-        for outfile, job_list in output_to_job.items():
-            for job in list(job_list):
-                for match_job in self.__job_dag.all_downstreams(job):
-                    if match_job in job_list:
-                        job_list.remove(match_job)
-
-        # Left over multiple items in job_list are problematic
-        for outfile, job_list in output_to_job.items():
-            # Same test has duplicate output files
-            if len(job_list) > 1 and len(set(job_list)) == 1:
-                job_list[0].setOutput('Duplicate output files:\n\t%s\n' % (outfile))
-                job_list[0].setStatus(job.error, 'DUPLICATE OUTFILES')
-
-            # Multiple tests will clobber eachothers output file
-            # Only check this with parallel_scheduling enabled because otherwise
-            # all of these jobs will be serialized
-            elif len(job_list) > 1 and self._setParallel():
-                for job in job_list:
-                    job.setOutput('Output file will over write pre-existing output file:\n\t%s\n' % (outfile))
-                    job.setStatus(job.error, 'OUTFILE RACE CONDITION')
+    def _checkOutputCollisions(self):
+        """
+        If running in parallel, checks to see if any tests have outputs
+        that would collide when run in parallel unless prereqs are set.
+        """
+        # No need to check if this spec can't run in parallel, because
+        # all tests will be run sequentially, with no more than one at once
+        if not self.canParallel():
+            return
+
+        # Sort by ID so we get it in the input file from top down
+        jobs = sorted(self.getJobs(), key = lambda job: job.getID())
+
+        # Work down the file, starting with the second input and looking up for
+        # collisions. By doing it in this order, we will error at the first occurrence.
+        # This is nice because if we list all of the collisions it could be a lot of
+        # confusing output
+        for i in range(1, len(jobs)):
+            job = jobs[i]
+            for other_i in reversed(range(i)):
+                other_job = jobs[other_i]
+                tester = job.getTester()
+                other_tester = other_job.getTester()
+                files = set(tester.getOutputFiles(self.options))
+                other_files = set(other_tester.getOutputFiles(self.options))
+                conflicting_files = list(files.intersection(other_files))
+                if conflicting_files \
+                    and not self.__job_dag.is_dependency(other_job, job) \
+                    and not self.__job_dag.is_dependency(job, other_job):
+                    print(' This test spec is set to run in parallel, but a race condition was found')
+                    print(' that could lead to multiple tests reading/writing from the same file.\n')
+                    print(f' Tests: {tester.getTestNameShort()}, {other_tester.getTestNameShort()}')
+                    print(f' File(s): {", ".join(conflicting_files)}\n')
+                    print(' You can resolve this issue by setting the appropriate prerequisites')
+                    print(' between your tests with the "prereq" parameter')
+                    sys.exit(1)

     def _skipPrereqs(self):
         """
@@ -272,6 +264,12 @@ def _printDownstreams(self, job):
             cyclic_path.append('%s -->'% (d_job.getTestNameShort()))
         return ' '.join(cyclic_path)

+    def getJobs(self):
+        """
+        Returns the sorted jobs in the DAG
+        """
+        return self.__job_dag.topological_sort()
+
     def printDAG(self):
         """ Print the current structure of the DAG """
         job_order = []
diff --git a/python/TestHarness/OutputInterface.py b/python/TestHarness/OutputInterface.py
new file mode 100644
index 000000000000..b6e54a7b2548
--- /dev/null
+++ b/python/TestHarness/OutputInterface.py
@@ -0,0 +1,137 @@
+#* This file is part of the MOOSE framework
+#* https://www.mooseframework.org
+#*
+#* All rights reserved, see COPYRIGHT for full restrictions
+#*
https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os +import json + +class OutputInterface: + """ Helper class for writing output to either memory or a file """ + def __init__(self): + # The in-memory output, if any + self.output = '' + # The path to write output to, if any + self.separate_output_path = None + + class BadOutputException(Exception): + """ Exception that is thrown when bad output is detected """ + def __init__(self, errors): + self.errors = errors + message = 'Bad output detected: ' + ', '.join(errors) + super().__init__(message) + + def setSeparateOutputPath(self, separate_output_path): + """ Sets the path for writing output to """ + self.separate_output_path = separate_output_path + + # If we have any dangling output, write it + if self.output: + self.setOutput(self.output) + self.output = '' + + def getSeparateOutputFilePath(self) -> str: + """ Gets the path that this output is writing to, if any """ + return self.separate_output_path + + def hasOutput(self) -> bool: + """ Whether or not this object has any content written """ + if self.separate_output_path: + return os.path.isfile(self.separate_output_path) + return len(self.output) > 0 + + def getOutput(self, sanitize: bool = True) -> str: + """ + Gets the underlying output, either from file or memory + + The sanitize parameter triggers whether or not to check + for bad output, in which case an exception will be thrown + if it is found. The intention here is to sanitize it + ahead of time with self.sanitizeOutput() so that you can + clean it then and appropriately report the error earlier + on before the output is used. + """ + output = '' + if self.separate_output_path: + try: + output = open(self.separate_output_path, 'r').read() + except FileNotFoundError: + pass + else: + output = self.output + + if sanitize: + _, sanitize_failures = self._sanitizeOutput(output) + if sanitize_failures: + raise self.BadOutputException(sanitize_failures) + + return output + + def setOutput(self, output: str): + """ Sets the output given some output string """ + if not output: + return + if self.separate_output_path: + open(self.separate_output_path, 'w').write(output) + else: + self.output = output + + def appendOutput(self, output: str): + """ Appends to the output """ + if not output: + return + if self.separate_output_path: + open(self.separate_output_path, 'a').write(output) + else: + self.output += output + + def clearOutput(self): + """ Clears the output """ + if self.separate_output_path: + if os.path.exists(self.separate_output_path): + os.remove(self.separate_output_path) + else: + self.output = '' + + @staticmethod + def _sanitizeOutput(output): + """ + Internal method for taking an output string, sanitizing + it if needed, and then returning a list of the failures + that were encountered (if any) + """ + failures = set() + + # Check for invalid characters + try: + json.dumps(output) + except UnicodeDecodeError: + # Convert invalid output to something json can handle + output = output.decode('utf-8','replace').encode('ascii', 'replace') + # Alert the user that output has invalid characters + failures.add('invalid output characters') + + # Check for NULL characters + null_chars = ['\0', '\x00'] + for null_char in null_chars: + if null_char in output: + output = output.replace(null_char, 'NULL') + failures.add('NULL output') + + return output, list(failures) + + def sanitizeOutput(self): + """ + 
Sanitizes the output in place and returns a list of the + checks that failed, if any. + + Should be called before processing the output. + """ + output, failures = self._sanitizeOutput(self.getOutput(sanitize=False)) + if failures: + self.setOutput(output) + return failures diff --git a/python/TestHarness/StatusSystem.py b/python/TestHarness/StatusSystem.py index a347abdcb82d..6ea71e73b989 100644 --- a/python/TestHarness/StatusSystem.py +++ b/python/TestHarness/StatusSystem.py @@ -8,6 +8,8 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from collections import namedtuple +import threading +import contextlib def initStatus(): status = namedtuple('status', 'status color code sort_value') @@ -20,6 +22,8 @@ class StatusSystem(object): """ A Class for supplying statuses, with status text color and corresponding exit codes. + Set locking=True within the initializer to enable thread-safe access. + Syntax: status = StatusSystem() @@ -103,8 +107,23 @@ class StatusSystem(object): queued, running] - def __init__(self): + def __init__(self, locking=False): + # The underlying status self.__status = self.no_status + # The lock for reading/changing the status, if any + if locking: + self.__lock = threading.Lock() + else: + self.__lock = None + + def getLock(self): + """ + Gets the thread lock for this system, if any. + + This is safe to use in a with statement even if locking + is not enabled. + """ + return self.__lock if self.__lock else contextlib.suppress() def createStatus(self, status_key='NA'): """ return a specific status object based on supplied status name """ @@ -115,39 +134,50 @@ def createStatus(self, status_key='NA'): def getStatus(self): """ Return the status object. + + This is thread-safe if initialized with locking=True. """ - return self.__status + with self.getLock(): + return self.__status - def getAllStatuses(self): + @staticmethod + def getAllStatuses(): """ return list of named tuples containing all status types """ - return self.__all_statuses + return StatusSystem.__all_statuses - def getFailingStatuses(self): + @staticmethod + def getFailingStatuses(): """ return list of named tuples containing failing status types """ - return self.__exit_nonzero_statuses + return StatusSystem.__exit_nonzero_statuses - def getSuccessStatuses(self): + @staticmethod + def getSuccessStatuses(): """ return list of named tuples containing exit code zero status types """ - return self.__exit_zero_statuses + return StatusSystem.__exit_zero_statuses - def getPendingStatuses(self): + @staticmethod + def getPendingStatuses(): """ return list of named tuples containing pending status types """ - return self.__pending_statuses + return StatusSystem.__pending_statuses def setStatus(self, status=no_status): """ Set the current status to status. If status is not supplied, 'no_status' is implied. There is a validation check during this process to ensure the named tuple adheres to this class's set statuses. - """ - if self.isValid(status): - self.__status = status - else: - raise StatusSystemError('Invalid status! %s' % (str(status))) - return self.__status - def isValid(self, status): - original = set(self.no_status._asdict().keys()) + This is thread-safe if initialized with locking=True. + """ + with self.getLock(): + if self.isValid(status): + self.__status = status + else: + raise StatusSystemError('Invalid status! 
%s' % (str(status))) + return self.__status + + @staticmethod + def isValid(status): + original = set(StatusSystem.no_status._asdict().keys()) altered = set(status._asdict().keys()) - if not original.difference(altered) or status in self.__all_statuses: + if not original.difference(altered) or status in StatusSystem.getAllStatuses(): return True diff --git a/python/TestHarness/TestHarness.py b/python/TestHarness/TestHarness.py index 6e4cfd85bd4e..28a531475329 100644 --- a/python/TestHarness/TestHarness.py +++ b/python/TestHarness/TestHarness.py @@ -15,6 +15,9 @@ from . import RaceChecker import subprocess import shutil +import socket +import datetime +import getpass from socket import gethostname from FactorySystem.Factory import Factory @@ -208,19 +211,27 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): # Build a Warehouse to hold the MooseObjects self.warehouse = Warehouse() - # Get dependant applications and load dynamic tester plugins - # If applications have new testers, we expect to find them in /scripts/TestHarness/testers + # Testers from this directory dirs = [os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))] - dirs.append(os.path.join(moose_dir, 'share', 'moose', 'python', 'TestHarness', 'testers')) - # Use the find_dep_apps script to get the dependant applications for an app - depend_app_dirs = findDepApps(app_name, use_current_only=True) - dirs.extend([os.path.join(my_dir, 'scripts', 'TestHarness') for my_dir in depend_app_dirs.split('\n')]) + # Get dependent applications and load dynamic tester plugins + # If applications have new testers, we expect to find them in /scripts/TestHarness/testers + # Use the find_dep_apps script to get the dependent applications for an app + app_dirs = findDepApps(app_name, use_current_only=True).split('\n') + # For installed binaries, the apps will exist in RELEASE_PATH/scripts, where in + # this case RELEASE_PATH is moose_dir + share_dir = os.path.join(moose_dir, 'share') + if os.path.isdir(share_dir): + for dir in os.listdir(share_dir): + if dir != 'moose': # already included + app_dirs.append(os.path.join(share_dir, dir)) + # Add scripts/TestHarness for all of the above + dirs.extend([os.path.join(my_dir, 'scripts', 'TestHarness') for my_dir in app_dirs]) # Finally load the plugins! 
self.factory.loadPlugins(dirs, 'testers', "IS_TESTER") - self._infiles = ['tests', 'speedtests'] + self._infiles = ['tests'] self.parse_errors = [] self.test_table = [] self.num_passed = 0 @@ -234,7 +245,6 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): self.code = b'2d2d6769726c2d6d6f6465' self.error_code = 0x0 self.keyboard_talk = True - self.results_file = '.previous_test_results.json' # Assume libmesh is a peer directory to MOOSE if not defined if "LIBMESH_DIR" in os.environ: self.libmesh_dir = os.environ['LIBMESH_DIR'] @@ -356,7 +366,7 @@ def __init__(self, argv, moose_dir, app_name=None, moose_python=None): def findAndRunTests(self, find_only=False): self.error_code = 0x0 self.preRun() - self.start_time = clock() + self.start_time = datetime.datetime.now() launched_tests = [] if self.options.input_file_name != '': self._infiles = self.options.input_file_name.split(',') @@ -429,7 +439,6 @@ def findAndRunTests(self, find_only=False): # Create the testers for this test testers = self.createTesters(dirpath, file, find_only, testroot_params) - # Schedule the testers (non blocking) self.scheduler.schedule(testers) @@ -525,8 +534,8 @@ def augmentParameters(self, filename, tester, testroot_params={}): params['spec_file'] = filename params['test_name'] = formatted_name + params['test_name_short'] = relative_hitpath params['test_dir'] = test_dir - params['relative_path'] = relative_path params['executable'] = testroot_params.get("executable", self.executable) params['app_name'] = self.app_name params['hostname'] = self.host_name @@ -540,8 +549,6 @@ def augmentParameters(self, filename, tester, testroot_params={}): if type(params['prereq']) != list: print(("Option 'prereq' needs to be of type list in " + params['test_name'])) sys.exit(1) - elif (params['prereq'] != ['ALL']): - params['prereq'] = [relative_path.replace('/tests/', '') + '.' 
+ item for item in params['prereq']] # Double the alloted time for tests when running with the valgrind option tester.setValgrindMode(self.options.valgrind_mode) @@ -567,8 +574,8 @@ def appendRecoverableTests(self, testers): part2 = copy.deepcopy(part1) # Part 1: + part1.appendTestName('_part1') part1_params = part1.parameters() - part1_params['test_name'] += '_part1' part1_params['cli_args'].append('--test-checkpoint-half-transient') if self.options.recoversuffix == 'cpa': part1_params['cli_args'].append('Outputs/out/type=Checkpoint') @@ -577,7 +584,7 @@ def appendRecoverableTests(self, testers): # Part 2: part2_params = part2.parameters() - part2_params['prereq'].append(part1.parameters()['test_name']) + part2_params['prereq'].append(part1.getTestNameShort()) part2_params['delete_output_before_running'] = False part2_params['cli_args'].append('--recover --recoversuffix ' + self.options.recoversuffix) part2.addCaveats('recover') @@ -596,23 +603,7 @@ def checkExpectError(self, output, expect_error): else: return True - def printOutput(self, job, color): - """ Method to print a testers output to the screen """ - output = '' - # Print what ever status the tester has at the time - if self.options.verbose or (job.isFail() and not self.options.quiet): - output = 'Working Directory: ' + job.getTestDir() + '\nRunning command: ' + job.getCommand() + '\n' - output += util.trimOutput(job, self.options) - output = output.replace('\r', '\n') # replace the carriage returns with newlines - lines = output.split('\n') - - if output != '': - test_name = util.colorText(job.getTestName() + ": ", color, colored=self.options.colored, code=self.options.code) - output = test_name + ("\n" + test_name).join(lines) - print(output) - return output - - def handleJobStatus(self, job): + def handleJobStatus(self, job, caveats=None): """ The Scheduler is calling back the TestHarness to inform us of a status change. The job may or may not be finished yet (RUNNING), or failing, passing, etc. 
@@ -622,19 +613,22 @@ def handleJobStatus(self, job): elif not job.isSilent(): # Print results and perform any desired post job processing if job.isFinished(): - status, message, color, status_code, sort_value = job.getJointStatus() - self.error_code = self.error_code | status_code + joint_status = job.getJointStatus() + self.error_code = self.error_code | joint_status.status_code # perform printing of application output if so desired - self.printOutput(job, color) + output = job.getOutputForScreen() + if output: + print(output) - # Print status with caveats - print((util.formatResult(job, self.options, caveats=True))) + # Print status with caveats (if caveats not overridden) + caveats = True if caveats is None else caveats + print(util.formatResult(job, self.options, caveats=caveats), flush=True) timing = job.getTiming() # Save these results for 'Final Test Result' summary - self.test_table.append( (job, sort_value, timing) ) + self.test_table.append( (job, joint_status.sort_value, timing) ) self.postRun(job.specs, timing) if job.isSkip(): @@ -648,32 +642,49 @@ def handleJobStatus(self, job): # Just print current status without saving results else: - print((util.formatResult(job, self.options, result='RUNNING', caveats=False))) + caveats = False if caveats is None else caveats + print(util.formatResult(job, self.options, result=job.getStatus().status, caveats=caveats), flush=True) + + def getStats(self, time_total: float) -> dict: + """ + Get cumulative stats for all runs + """ + num_nonzero_timing = sum(1 if float(tup[0].getTiming()) > 0 else 0 for tup in self.test_table) + if num_nonzero_timing > 0: + time_max = max(float(tup[0].getTiming()) for tup in self.test_table) + time_average = sum(float(tup[0].getTiming()) for tup in self.test_table) / num_nonzero_timing + else: + time_max = 0 + time_average = 0 + + stats = {'num_passed': self.num_passed, + 'num_failed': self.num_failed, + 'num_skipped': self.num_skipped, + 'num_total': self.num_passed + self.num_failed + self.num_skipped, + 'time_total': time_total, + 'time_max': time_max, + 'time_average': time_average} + stats.update(self.scheduler.appendStats()) + return stats # Print final results, close open files, and exit with the correct error code def cleanup(self): - if self.options.queue_cleanup and self.options.results_file: - try: - os.remove(self.options.results_file) - except OSError: - pass - return - # Print the results table again if a bunch of output was spewed to the screen between # tests as they were running if len(self.parse_errors) > 0: - print(('\n\nParser Errors:\n' + ('-' * (util.TERM_COLS)))) + print(('\n\nParser Errors:\n' + ('-' * (self.options.term_cols)))) for err in self.parse_errors: print((util.colorText(err, 'RED', html=True, colored=self.options.colored, code=self.options.code))) if (self.options.verbose or (self.num_failed != 0 and not self.options.quiet)) and not self.options.dry_run: - print(('\n\nFinal Test Results:\n' + ('-' * (util.TERM_COLS)))) + print(('\n\nFinal Test Results:\n' + ('-' * (self.options.term_cols)))) for (job, sort_value, timing) in sorted(self.test_table, key=lambda x: x[1]): print((util.formatResult(job, self.options, caveats=True))) - time = clock() - self.start_time + time_total = (datetime.datetime.now() - self.start_time).total_seconds() + stats = self.getStats(time_total) - print(('-' * (util.TERM_COLS))) + print(('-' * (self.options.term_cols))) # Mask off TestHarness error codes to report parser errors fatal_error = '' @@ -681,19 +692,12 @@ def cleanup(self): fatal_error += 
', FATAL PARSER ERROR' self.error_code = self.error_code | 0x80 - # Alert the user to their session file - if self.options.queueing and not self.options.dry_run: - print(('Your session file is %s' % self.options.results_file)) - # Print a different footer when performing a dry run if self.options.dry_run: - print(('Processed %d tests in %.1f seconds.' % (self.num_passed+self.num_skipped, time))) - summary = '%d would run' - summary += ', %d would be skipped' + print(f'Processed {self.num_passed + self.num_skipped} tests in {stats["time_total"]:.1f} seconds.') + summary = f'{self.num_passed} would run, {self.num_skipped} would be skipped' summary += fatal_error - print((util.colorText( summary % (self.num_passed, self.num_skipped), "", html = True, \ - colored=self.options.colored, code=self.options.code ))) - + print(util.colorText(summary, "", html=True, colored=self.options.colored, code=self.options.code)) else: num_nonzero_timing = sum(1 if float(tup[0].getTiming()) > 0 else 0 for tup in self.test_table) if num_nonzero_timing > 0: @@ -702,36 +706,41 @@ def cleanup(self): else: timing_max = 0 timing_avg = 0 - print(('Ran %d tests in %.1f seconds. Average test time %.1f seconds, maximum test time %.1f seconds.' % (self.num_passed+self.num_failed, time, timing_avg, timing_max))) + summary = f'Ran {self.num_passed + self.num_failed} tests in {stats["time_total"]:.1f} seconds.' + summary += f' Average test time {timing_avg:.1f} seconds,' + summary += f' maximum test time {timing_max:.1f} seconds.' + print(summary) + + # Get additional results from the scheduler + scheduler_summary = self.scheduler.appendResultFooter(stats) + if scheduler_summary: + print(scheduler_summary) if self.num_passed: - summary = '%d passed' + summary = f'{self.num_passed} passed' else: - summary = '%d passed' - summary += ', %d skipped' + summary = f'{self.num_passed} passed' + summary += f', {self.num_skipped} skipped' if self.num_pending: - summary += ', %d pending' - else: - summary += ', %d pending' + summary += f', {self.num_pending} pending' if self.num_failed: - summary += ', %d FAILED' + summary += f', {self.num_failed} FAILED' else: - summary += ', %d failed' + summary += f', {self.num_failed} failed' if self.scheduler.maxFailures(): self.error_code = self.error_code | 0x80 summary += '\nMAX FAILURES REACHED' summary += fatal_error - print((util.colorText( summary % (self.num_passed, self.num_skipped, self.num_pending, self.num_failed), "", html = True, \ - colored=self.options.colored, code=self.options.code ))) + print(util.colorText(summary, "", html=True, colored=self.options.colored, code=self.options.code)) if self.options.longest_jobs: # Sort all jobs by run time sorted_tups = sorted(self.test_table, key=lambda tup: float(tup[0].getTiming()), reverse=True) print('\n%d longest running jobs:' % self.options.longest_jobs) - print(('-' * (util.TERM_COLS))) + print(('-' * (self.options.term_cols))) # Copy the current options and force timing to be true so that # we get times when we call formatResult() below @@ -748,7 +757,7 @@ def cleanup(self): # The TestHarness receives individual jobs out of order (can't realistically use self.test_table) tester_dirs = {} dag_table = [] - for jobs, dag, thread_lock in self.scheduler.retrieveDAGs(): + for jobs, dag in self.scheduler.retrieveDAGs(): original_dag = dag.getOriginalDAG() total_time = float(0.0) tester = None @@ -764,94 +773,32 @@ def cleanup(self): sorted_table = sorted(dag_table, key=lambda dag_table: float(dag_table[1]), reverse=True) if 
sorted_table[0:self.options.longest_jobs]: print(f'\n{self.options.longest_jobs} longest running folders:') - print(('-' * (util.TERM_COLS))) + print(('-' * (self.options.term_cols))) # We can't use util.formatResults, as we are representing a group of testers for group in sorted_table[0:self.options.longest_jobs]: - print(str(group[0]).ljust((util.TERM_COLS - (len(group[1]) + 4)), ' '), f'[{group[1]}s]') + print(str(group[0]).ljust((self.options.term_cols - (len(group[1]) + 4)), ' '), f'[{group[1]}s]') print('\n') - # Perform any write-to-disc operations - self.writeResults() - - def writeResults(self): - """ Don't update the results file when using the --failed-tests argument """ - if self.options.failed_tests or self.options.show_last_run: - return - - """ write test results to disc in some fashion the user has requested """ - all_jobs = self.scheduler.retrieveJobs() - - # Gather and print the jobs with race conditions after the jobs are finished - # and only run when running --diag. - if self.options.pedantic_checks: - checker = RaceChecker.RaceChecker(all_jobs) - if checker.findRacePartners(): - # Print the unique racer conditions and adjust our error code. - self.error_code = checker.printUniqueRacerSets() - else: - print("There are no race conditions.") - - # Record the input file name that was used - self.options.results_storage['INPUT_FILE_NAME'] = self.options.input_file_name - - # Record that we are using --sep-files* options - self.options.results_storage['SEP_FILES'] = (True if self.options.pbs else False - or self.options.ok_files - or self.options.fail_files - or self.options.sep_files) - - # Record the Scheduler Plugin used - self.options.results_storage['SCHEDULER'] = self.scheduler.__class__.__name__ - - # Write some useful data to our results_storage - for job_group in all_jobs: - for job in job_group: - # If queueing, do not store silent results in session file - if job.isSilent() and self.options.queueing: - continue - - status, message, message_color, status_code, sort_value = job.getJointStatus() - - # Create empty key based on TestDir, or re-inialize with existing data so we can append to it - self.options.results_storage[job.getTestDir()] = self.options.results_storage.get(job.getTestDir(), {}) - - # If output has been stored in separate files, don't make additional copies by - # storing that data in this json results file (--pbs || --sep-files, etc options). 
- output = '' if job.getOutputFile() else job.getOutput() - - self.options.results_storage[job.getTestDir()][job.getTestName()] = {'NAME' : job.getTestNameShort(), - 'LONG_NAME' : job.getTestName(), - 'TIMING' : job.getTiming(), - 'STATUS' : status, - 'STATUS_MESSAGE' : message, - 'FAIL' : job.isFail(), - 'COLOR' : message_color, - 'CAVEATS' : list(job.getCaveats()), - 'OUTPUT' : output, - 'COMMAND' : job.getCommand(), - 'META_DATA' : job.getMetaData()} - - # Additional data to store (overwrites any previous matching keys) - self.options.results_storage[job.getTestDir()].update(job.getMetaData()) - - if self.options.output_dir: - self.options.results_file = os.path.join(self.options.output_dir, self.options.results_file) + all_jobs = self.scheduler.retrieveJobs() - if self.options.results_storage and self.options.results_file: - try: - with open(self.options.results_file, 'w') as data_file: - json.dump(self.options.results_storage, data_file, indent=2) - except UnicodeDecodeError: - print('\nERROR: Unable to write results due to unicode decode/encode error') + # Gather and print the jobs with race conditions after the jobs are finished + # and only run when running --pedantic-checks. + if self.options.pedantic_checks: + checker = RaceChecker.RaceChecker(all_jobs) + if checker.findRacePartners(): + # Print the unique racer conditions and adjust our error code. + self.error_code = checker.printUniqueRacerSets() + else: + print("There are no race conditions.") - # write to a plain file to aid in reproducing error - with open(self.options.results_file + '.unicode_error' , 'w') as f: - f.write(self.options.results_storage) + if not self.useExistingStorage(): + # Store the results from each job + for job_group in all_jobs: + for job in job_group: + job.storeResults(self.scheduler) - sys.exit(1) - except IOError: - print('\nERROR: Unable to write results due to permissions') - sys.exit(1) + # And write the results, including the stats + self.writeResults(complete=True, stats=stats) try: # Write one file, with verbose information (--file) @@ -866,33 +813,6 @@ def writeResults(self): formated_results = util.formatResult( job, self.options, result=job.getOutput(), color=False) f.write(formated_results + '\n') - # Write a separate file for each test with verbose information (--sep-files, --sep-files-ok, --sep-files-fail) - if ((self.options.ok_files and self.num_passed) - or (self.options.fail_files and self.num_failed)): - for job_group in all_jobs: - for job in job_group: - status, message, message_color, status_code, sort_value = job.getJointStatus() - - if self.options.output_dir: - output_dir = self.options.output_dir - else: - output_dir = job.getTestDir() - - output = '' - # Append input file contents to output - if self.options.include_input: - # This is a file i/o operation. 
We only want to do this once, and only if necessary - input_file = job.getInputFileContents() - if input_file: - output += "\n\nINPUT FILE:\n" + str(input_file) - - output += "\n\nTEST OUTPUT:" + job.getOutput() - output_file = job.getOutputFile() - formated_results = util.formatResult(job, self.options, result=output, color=False) - if output_file: - with open(output_file, 'w') as f: - f.write(formated_results) - except IOError: print('Permission error while writing results to disc') sys.exit(1) @@ -900,25 +820,148 @@ def writeResults(self): print('Error while writing results to disc') sys.exit(1) + def determineScheduler(self): + if self.options.hpc_host and not self.options.hpc: + print(f'ERROR: --hpc must be set with --hpc-host for an unknown host') + sys.exit(1) + + if self.options.hpc == 'pbs': + return 'RunPBS' + elif self.options.hpc == 'slurm': + return 'RunSlurm' + # The default scheduler plugin + return 'RunParallel' + + def initializeResults(self): + """ Initializes the results storage + + If using existing storage, this will load the previous storage. + + If not using existing storage, this will: + - Delete the previous storage, if any + - Setup the header for the storage + - Write the incomplete storage to file + """ + if self.useExistingStorage(): + if not os.path.exists(self.options.results_file): + print(f'The previous run {self.options.results_file} does not exist') + sys.exit(1) + try: + with open(self.options.results_file, 'r') as f: + self.options.results_storage = json.load(f) + except: + print(f'ERROR: Failed to load result {self.options.results_file}') + raise + + if self.options.results_storage['incomplete']: + print(f'ERROR: The previous result {self.options.results_file} is incomplete!') + sys.exit(1) + + # Adhere to previous input file syntax, or set the default + _input_file_name = 'tests' + if self.options.input_file_name: + _input_file_name = self.options.input_file_name + self.options.input_file_name = self.options.results_storage.get('input_file_name', _input_file_name) + + # Done working with existing storage + return + + # Remove the old one if it exists + if os.path.exists(self.options.results_file): + os.remove(self.options.results_file) + + # Not using previous or previous failed, initialize a new one + self.options.results_storage = {} + storage = self.options.results_storage + + # Record the input file name that was used + storage['input_file_name'] = self.options.input_file_name + + # Record that we are using --sep-files + storage['sep_files'] = self.options.sep_files + + # Record the Scheduler Plugin used + storage['scheduler'] = self.scheduler.__class__.__name__ + + # Record information on the host we can ran on + storage['hostname'] = socket.gethostname() + storage['user'] = getpass.getuser() + storage['testharness_path'] = os.path.abspath(os.path.join(os.path.abspath(__file__), '..')) + storage['testharness_args'] = sys.argv[1:] + storage['moose_dir'] = self.moose_dir + + # Record information from apptainer, if any + apptainer_container = os.environ.get('APPTAINER_CONTAINER') + if apptainer_container: + apptainer = {'path': apptainer_container} + # Information from ApptainerGenerator generated containers + var_prefix = 'MOOSE_APPTAINER_GENERATOR' + generator_name = os.environ.get(f'{var_prefix}_NAME') + if generator_name: + for suffix in ['LIBRARY', 'NAME', 'TAG', 'VERSION']: + apptainer[f'generator_{suffix.lower()}'] = os.environ.get(f'{var_prefix}_{suffix}') + storage['apptainer'] = apptainer + + # Record when the run began + storage['time'] = 
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + # Record any additional data from the scheduler + storage.update(self.scheduler.appendResultFileHeader()) + + # Record whether or not the storage is incomplete + storage['incomplete'] = True + + # Empty storage for the tests + storage['tests'] = {} + + # Write the headers + self.writeResults() + + def writeResults(self, complete=False, stats=None): + """ Forcefully write the current results to file + + Will not do anything if using existing storage. + """ + # Not writing results + if self.useExistingStorage(): + raise Exception('Should not write results') + + # Make it as complete (run is done) + self.options.results_storage['incomplete'] = not complete + # Store the stats + self.options.results_storage['stats'] = stats + + # Store to a temporary file so that we always have a working file + file = self.options.results_file + file_in_progress = self.options.results_file + '.inprogress' + try: + with open(file_in_progress, 'w') as data_file: + json.dump(self.options.results_storage, data_file, indent=2) + except UnicodeDecodeError: + print(f'\nERROR: Unable to write results {file_in_progress} due to unicode decode/encode error') + + # write to a plain file to aid in reproducing error + with open(file + '.unicode_error' , 'w') as f: + f.write(self.options.results_storage) + + raise + except IOError: + print(f'\nERROR: Unable to write results {file_in_progress} due to permissions') + raise + + # Replace the file now that it's complete + try: + os.replace(file_in_progress, file) + except: + print(f'\nERROR: Failed to move in progress results {file_in_progress} to {file}') + raise + def initialize(self, argv, app_name): # Load the scheduler plugins plugin_paths = [os.path.join(self.moose_dir, 'python', 'TestHarness'), os.path.join(self.moose_dir, 'share', 'moose', 'python', 'TestHarness')] self.factory.loadPlugins(plugin_paths, 'schedulers', "IS_SCHEDULER") - self.options.queueing = False - if self.options.pbs: - # original_storage will become the results file for each test being launched by PBS, and will be - # saved in the same directory as the test spec file. This is so we can launch multiple 'run_tests' - # without clobbering the parent results_file. Meanwhile, the new results_file is going to be - # renamed to whatever the user decided to identify their PBS launch with. 
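# A minimal sketch of the atomic-write pattern that writeResults() above relies on:
# dump the JSON to a sibling ".inprogress" file first, then swap it into place with
# os.replace() so a crash never leaves a half-written results file behind. The helper
# name write_results_atomically is only illustrative, not a TestHarness API.
import json, os

def write_results_atomically(path, results):
    """Write `results` as JSON to `path` without leaving a partial file behind."""
    in_progress = path + '.inprogress'
    with open(in_progress, 'w') as f:
        json.dump(results, f, indent=2)
    # os.replace() is atomic when source and destination live on the same filesystem,
    # so readers see either the old complete file or the new complete file.
    os.replace(in_progress, path)

# For example, the header written before any tests have run might look like:
# write_results_atomically('.previous_test_results.json', {'incomplete': True, 'tests': {}})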
- self.original_storage = self.options.results_file - self.options.results_file = os.path.abspath(self.options.pbs) - self.options.queueing = True - scheduler_plugin = 'RunPBS' - - # The default scheduler plugin - else: - scheduler_plugin = 'RunParallel' + scheduler_plugin = self.determineScheduler() # Augment the Scheduler params with plugin params plugin_params = self.factory.validParams(scheduler_plugin) @@ -930,6 +973,7 @@ def initialize(self, argv, app_name): # Create the scheduler self.scheduler = self.factory.create(scheduler_plugin, self, plugin_params) + # Now that the scheduler is setup, initialize the results storage # Save executable-under-test name to self.executable exec_suffix = 'Windows' if platform.system() == 'Windows' else '' self.executable = app_name + '-' + self.options.method + exec_suffix @@ -954,47 +998,20 @@ def initialize(self, argv, app_name): mydir = os.path.dirname(os.path.realpath(__file__)) self.executable = os.path.join(mydir, '../../../..', 'bin', self.executable) - # Save the output dir since the current working directory changes during tests - self.output_dir = os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])), self.options.output_dir) - # Create the output dir if they ask for it. It is easier to ask for forgiveness than permission if self.options.output_dir: try: - os.makedirs(self.output_dir) + os.makedirs(self.options.output_dir) except OSError as ex: if ex.errno == errno.EEXIST: pass else: raise - # Use a previous results file, or declare the variable - self.options.results_storage = {} - if self.useExistingStorage(): - with open(self.options.results_file, 'r') as f: - try: - self.options.results_storage = json.load(f) - - # Adhere to previous input file syntax, or set the default - _input_file_name = 'tests' - if self.options.input_file_name: - _input_file_name = self.options.input_file_name - self.options.input_file_name = self.options.results_storage.get('INPUT_FILE_NAME', _input_file_name) - - except ValueError: - # This is a hidden file, controled by the TestHarness. So we probably shouldn't error - # and exit. Perhaps a warning instead, and create a new file? Down the road, when - # we use this file for PBS etc, this should probably result in an exception. - print(('INFO: Previous %s file is damaged. Creating a new one...' 
% (self.results_storage))) + # Initialize the results storage or load the previous results + self.initializeResults() def useExistingStorage(self): """ reasons for returning bool if we should use a previous results_storage file """ - if (os.path.exists(self.options.results_file) - and (self.options.failed_tests or self.options.pbs or self.options.show_last_run)): - return True - elif ((self.options.failed_tests or self.options.show_last_run) - and not os.path.exists(self.options.results_file)): - print('A previous run does not exist') - sys.exit(1) - elif os.path.exists(self.options.results_file): - os.remove(self.options.results_file) + return self.options.failed_tests or self.options.show_last_run ## Parse command line options and assign them to self.options def parseCLArgs(self, argv): @@ -1059,9 +1076,7 @@ def parseCLArgs(self, argv): outputgroup.add_argument('--show-directory', action='store_true', dest='show_directory', help='Print test directory path in out messages') outputgroup.add_argument('-o', '--output-dir', nargs=1, metavar='directory', dest='output_dir', default='', help='Save all output files in the directory, and create it if necessary') outputgroup.add_argument('-f', '--file', nargs=1, action='store', dest='file', help='Write verbose output of each test to FILE and quiet output to terminal') - outputgroup.add_argument('-x', '--sep-files', action='store_true', dest='sep_files', help='Write the output of each test to a separate file. Only quiet output to terminal. This is equivalant to \'--sep-files-fail --sep-files-ok\'') - outputgroup.add_argument('--sep-files-ok', action='store_true', dest='ok_files', help='Write the output of each passed test to a separate file') - outputgroup.add_argument('-a', '--sep-files-fail', action='store_true', dest='fail_files', help='Write the output of each FAILED test to a separate file. Only quiet output to terminal.') + outputgroup.add_argument('-x', '--sep-files', action='store_true', dest='sep_files', help='Write the output of each test to a separate file. 
Only quiet output to terminal.') outputgroup.add_argument('--include-input-file', action='store_true', dest='include_input', help='Include the contents of the input file when writing the results of a test to a file') outputgroup.add_argument("--testharness-unittest", action="store_true", help="Run the TestHarness unittests that test the TestHarness.") outputgroup.add_argument("--json", action="store_true", dest="json", help="Dump the parameters for the testers in JSON Format") @@ -1069,16 +1084,39 @@ def parseCLArgs(self, argv): outputgroup.add_argument("--dump", action="store_true", dest="dump", help="Dump the parameters for the testers in GetPot Format") outputgroup.add_argument("--no-trimmed-output", action="store_true", dest="no_trimmed_output", help="Do not trim the output") outputgroup.add_argument("--no-trimmed-output-on-error", action="store_true", dest="no_trimmed_output_on_error", help="Do not trim output for tests which cause an error") - outputgroup.add_argument("--results-file", nargs=1, default=self.results_file, help="Save run_tests results to an alternative json file (default: %(default)s)") + outputgroup.add_argument("--results-file", nargs=1, default='.previous_test_results.json', help="Save run_tests results to an alternative json file (default: %(default)s)") outputgroup.add_argument("--show-last-run", action="store_true", dest="show_last_run", help="Display previous results without executing tests again") - queuegroup = parser.add_argument_group('Queue Options', 'Options controlling which queue manager to use') - queuegroup.add_argument('--pbs', nargs=1, action='store', metavar='name', help='Launch tests using PBS as your scheduler. You must supply a name to identify this session with') - queuegroup.add_argument('--pbs-pre-source', nargs=1, action="store", dest='queue_source_command', metavar='', help='Source specified file before launching tests') - queuegroup.add_argument('--pbs-project', nargs=1, action='store', dest='queue_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') - queuegroup.add_argument('--pbs-queue', nargs=1, action='store', dest='queue_queue', type=str, metavar='', help='Submit jobs to the specified queue') - queuegroup.add_argument('--pbs-node-cpus', nargs=1, action='store', type=int, default=None, metavar='', help='CPUS Per Node. 
The default (no setting), will always use only one node') - queuegroup.add_argument('--pbs-cleanup', nargs=1, action="store", dest='queue_cleanup', metavar='name', help='Clean up files generated by supplied --pbs name') + # Options for HPC execution + hpcgroup = parser.add_argument_group('HPC Options', 'Options controlling HPC execution') + hpcgroup.add_argument('--hpc', dest='hpc', action='store', choices=['pbs', 'slurm'], help='Launch tests using a HPC scheduler') + hpcgroup.add_argument('--hpc-host', nargs='+', action='store', dest='hpc_host', metavar='', help='The host(s) to use for submitting HPC jobs') + hpcgroup.add_argument('--hpc-pre-source', nargs=1, action="store", dest='hpc_pre_source', metavar='', help='Source specified file before launching HPC tests') + hpcgroup.add_argument('--hpc-file-timeout', nargs=1, type=int, action='store', dest='hpc_file_timeout', default=300, help='The time in seconds to wait for HPC output') + hpcgroup.add_argument('--hpc-scatter-procs', nargs=1, type=int, action='store', dest='hpc_scatter_procs', default=None, help='Set to run HPC jobs with scatter placement when the processor count is this or lower') + hpcgroup.add_argument('--hpc-apptainer-bindpath', nargs=1, action='store', type=str, dest='hpc_apptainer_bindpath', help='Sets the apptainer bindpath for HPC jobs') + hpcgroup.add_argument('--hpc-apptainer-no-home', action='store_true', dest='hpc_apptainer_no_home', help='Passes --no-home to apptainer for HPC jobs') + hpcgroup.add_argument('--hpc-project', nargs=1, action='store', dest='hpc_project', type=str, default='moose', metavar='', help='Identify your job(s) with this project (default: %(default)s)') + hpcgroup.add_argument('--hpc-no-hold', nargs=1, action='store', type=bool, default=False, dest='hpc_no_hold', help='Do not pre-create hpc jobs to be held') + hpcgroup.add_argument('--pbs-queue', nargs=1, action='store', dest='hpc_queue', type=str, metavar='', help='Submit jobs to the specified queue') + + # Try to find the terminal size if we can + # Try/except here because the terminal size could fail w/o a display + term_cols = None + try: + term_cols = os.get_terminal_size().columns * 7/8 + except: + term_cols = 110 + pass + + # Optionally load in the environment controlled values + term_cols = int(os.getenv('MOOSE_TERM_COLS', term_cols)) + term_format = os.getenv('MOOSE_TERM_FORMAT', 'njcst') + + # Terminal options + termgroup = parser.add_argument_group('Terminal Options', 'Options for controlling the formatting of terminal output') + termgroup.add_argument('--term-cols', dest='term_cols', action='store', type=int, default=term_cols, help='The number columns to use in output') + termgroup.add_argument('--term-format', dest='term_format', action='store', type=str, default=term_format, help='The formatting to use when outputting job status') code = True if self.code.decode() in argv: @@ -1087,6 +1125,16 @@ def parseCLArgs(self, argv): self.options = parser.parse_args(argv[1:]) self.options.code = code + # Try to guess the --hpc option if --hpc-host is set + if self.options.hpc_host and not self.options.hpc: + hpc_host = self.options.hpc_host[0] + if 'sawtooth' in hpc_host or 'lemhi' in hpc_host: + self.options.hpc = 'pbs' + elif 'bitterroot' in hpc_host: + self.options.hpc = 'slurm' + if self.options.hpc: + print(f'INFO: Setting --hpc={self.options.hpc} for known host {hpc_host}') + self.options.runtags = [tag for tag in self.options.run.split(',') if tag != ''] # Convert all list based options of length one to scalars @@ -1100,8 +1148,6 @@ 
def parseCLArgs(self, argv): # Exit if options don't make any sense, print warnings if they are merely weird def checkAndUpdateCLArgs(self): opts = self.options - if opts.output_dir and not (opts.file or opts.sep_files or opts.fail_files or opts.ok_files): - print('WARNING: --output-dir is specified but no output files will be saved, use -f or a --sep-files option') if opts.group == opts.not_group: print('ERROR: The group and not_group options cannot specify the same group') sys.exit(1) @@ -1117,36 +1163,20 @@ def checkAndUpdateCLArgs(self): if opts.spec_file and not os.path.exists(opts.spec_file): print('ERROR: --spec-file supplied but path does not exist') sys.exit(1) - if opts.queue_cleanup and not opts.pbs: - print('ERROR: --queue-cleanup cannot be used without additional queue options') - sys.exit(1) - if opts.queue_source_command and not os.path.exists(opts.queue_source_command): - print('ERROR: pre-source supplied but path does not exist') - sys.exit(1) - if opts.failed_tests and not opts.pbs and not os.path.exists(opts.results_file): - print('ERROR: --failed-tests could not detect a previous run') - sys.exit(1) - if opts.pbs and opts.pedantic_checks: - print('ERROR: --pbs and --pedantic-checks cannot be used simultaneously') - sys.exit(1) - if opts.pbs and opts.jobs: - print('ERROR: --pbs and -j|--jobs cannot be used simultaneously') - sys.exit(1) - if opts.pbs and opts.extra_info: - print('ERROR: --pbs and -e (extra info) cannot be used simultaneously') - sys.exit(1) if opts.verbose and opts.quiet: print('Do not be an oxymoron with --verbose and --quiet') sys.exit(1) - # Flatten input_file_name from ['tests', 'speedtests'] to just tests if none supplied - # We can not support running two spec files during one launch into a third party queue manager. - # This is because Jobs created by spec files, have no way of accessing other jobs created by - # other spec files. They only know about the jobs a single spec file generates. - # NOTE: Which means, tests and speedtests running simultaneously currently have a chance to - # clobber each others output during normal operation!? 
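# A condensed sketch of the scheduler selection wired up above: --hpc picks the
# RunPBS/RunSlurm plugin directly, and for the known hosts the harness guesses --hpc
# from --hpc-host. pick_scheduler is a hypothetical helper written only to illustrate
# that decision, not a TestHarness function.
def pick_scheduler(hpc, hpc_host):
    """Return the scheduler plugin name the harness would load."""
    if hpc_host and not hpc:
        host = hpc_host[0]
        if 'sawtooth' in host or 'lemhi' in host:
            hpc = 'pbs'
        elif 'bitterroot' in host:
            hpc = 'slurm'
        else:
            raise ValueError('--hpc must be set with --hpc-host for an unknown host')
    if hpc == 'pbs':
        return 'RunPBS'
    if hpc == 'slurm':
        return 'RunSlurm'
    # Local execution stays the default
    return 'RunParallel'

# pick_scheduler(None, ['sawtooth1']) -> 'RunPBS'
# pick_scheduler('slurm', None)       -> 'RunSlurm'
# pick_scheduler(None, None)          -> 'RunParallel'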
- if opts.pbs and not opts.input_file_name: - self.options.input_file_name = 'tests' + # Setup absolute paths and output paths + if opts.output_dir: + opts.output_dir = os.path.abspath(opts.output_dir) + opts.results_file = os.path.join(opts.output_dir, opts.results_file) + else: + opts.results_file = os.path.abspath(opts.results_file) + + if opts.failed_tests and not os.path.exists(opts.results_file): + print('ERROR: --failed-tests could not detect a previous run') + sys.exit(1) # Update any keys from the environment as necessary if not self.options.method: @@ -1162,16 +1192,6 @@ def checkAndUpdateCLArgs(self): if opts.libmesh_dir: self.libmesh_dir = opts.libmesh_dir - # User wants to write all output, so unify the options involved - if opts.sep_files: - opts.ok_files = True - opts.fail_files = True - opts.quiet = True - - # User wants only failed files, so unify the options involved - elif opts.fail_files: - opts.quiet = True - def postRun(self, specs, timing): return diff --git a/python/TestHarness/__init__.py b/python/TestHarness/__init__.py index 0c9708c685c5..59cc7b489648 100644 --- a/python/TestHarness/__init__.py +++ b/python/TestHarness/__init__.py @@ -13,5 +13,6 @@ sys.exit(1) from .TestHarness import TestHarness +from .OutputInterface import OutputInterface from .TestHarness import findDepApps __all__=['TestHarness', 'findDepApps'] diff --git a/python/TestHarness/runners/HPCRunner.py b/python/TestHarness/runners/HPCRunner.py new file mode 100644 index 000000000000..86015cf2964d --- /dev/null +++ b/python/TestHarness/runners/HPCRunner.py @@ -0,0 +1,278 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import re, time, os, subprocess, yaml +from TestHarness.runners.Runner import Runner +from TestHarness import util + +class HPCRunner(Runner): + """ + Base Runner to be used with HPC schedulers (PBS, slurm) + """ + def __init__(self, job, options, run_hpc): + super().__init__(job, options) + + # The RunHPC object + self.run_hpc = run_hpc + + # The HPC job, set during spawn() + self.hpc_job = None + + # Interval in seconds for polling for job status + self.job_status_poll_time = 0.1 + + # Interval in seconds for polling for file completion + self.file_completion_poll_time = 0.1 + + def spawn(self, timer): + # The runner_run output, which is the output from what we're + # actually running, already exists as a file. So just load + # it from that file instead and don't bother loading it + # into memory + hpc_job_output_path = self.run_hpc.getHPCJobOutputPath(self.job) + self.getRunOutput().setSeparateOutputPath(hpc_job_output_path) + + # Rely on the RunHPC object to queue the job + self.hpc_job = self.run_hpc.queueJob(self.job) + + def wait(self, timer): + # The states that we should wait on. 
Anything else should + # be an invalid state for waiting + wait_states = [self.hpc_job.State.held, + self.hpc_job.State.queued, + self.hpc_job.State.running] + + # Poll loop waiting for the job to be finished + # This gets a structure that represents the job, and the + # polling itself is only done on occasion within RunHPC + while True: + time.sleep(self.job_status_poll_time) + with self.hpc_job.getLock(): + if self.hpc_job.state not in wait_states: + self.exit_code = self.hpc_job.exit_code + break + + # The PBS output (stdout+stderr) + output_file = self.run_hpc.getHPCJobOutputPath(self.job) + # The result file (exit code + walltime) + result_file = self.run_hpc.getHPCJobResultPath(self.job) + + # If the Job is already finished, something happened in the + # HPC scheduler so we have an invalid state for processing + if self.job.isFinished(): + return + + tester = self.job.getTester() + + # Determine the output files that we need to wait for to be complete + wait_files = set([output_file, result_file]) + # Output files needed by the Tester, only if it says we should + if tester.mustOutputExist(self.exit_code): + wait_files.update(self.job.getOutputFiles(self.options)) + # The files that we can read, but are incomplete (no terminator) + incomplete_files = set() + + # Wait for all of the files to be available + timer.start('hpc_wait_output') + waited_time = 0 + walltime = None + while wait_files or incomplete_files: + # Don't bother if we've been killed + if self.hpc_job.isKilled(): + return + + # Look for each file + for file in wait_files.copy(): + if os.path.exists(file) and os.path.isfile(file): + wait_files.discard(file) + incomplete_files.add(file) + + # Check for file completeness + for file in incomplete_files.copy(): + if self.fileIsReady(file): + # Store the result + if file == result_file: + with open(file, 'r') as f: + result = yaml.safe_load(f) + self.exit_code = result['exit_code'] + walltime = result['walltime'] + + # Delete this, we don't really need it to hang around + try: + os.remove(file) + except OSError: + pass + + # Done with this file + incomplete_files.discard(file) + + # We've waited for files for too long + if (wait_files or incomplete_files) and waited_time >= self.options.hpc_file_timeout: + self.job.setStatus(self.job.timeout, 'FILE TIMEOUT') + def print_files(files, type): + if files: + self.appendOutput(f'{type} output file(s)\n') + self.appendOutput('\n'.join(files) + '\n') + print_files(wait_files, 'Unavailable') + print_files(incomplete_files, 'Incomplete') + break + + waited_time += self.file_completion_poll_time + time.sleep(self.file_completion_poll_time) + timer.stop('hpc_wait_output') + + # If we have a walltime from output, use it instead as it'll be + # more accurate for the real runtime + if walltime: + timer = self.job.timer + start_time = timer.startTime('runner_run') + end_time = start_time + walltime + timer.reset('runner_run') + timer.start('runner_run', start_time) + timer.stop('runner_run', end_time) + + # Handle openmpi appending a null character at the end of jobs + # that return a nonzero exit code. An example of this is: + # + # -------------------------------------------------------------------------- + # MPI_ABORT was invoked on rank 0 in communicator MPI_COMM_WORLD + # Proc: [[PID,1],0] + # Errorcode: 1 + # + # NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes. + # You may or may not see output from other processes, depending on + # exactly when Open MPI kills them. 
+ # -------------------------------------------------------------------------- + # -------------------------------------------------------------------------- + # prterun has exited due to process rank 0 with PID 0 on node HOSTNAME calling + # "abort". This may have caused other processes in the application to be + # terminated by signals sent by prterun (as reported here). + # -------------------------------------------------------------------------- + # + # + # Where is there the null character ends up. Thus, in cases + # where we have a nonzero exit code and a MPI_ABORT, we'll try to remove these. + if self.exit_code != 0 and self.job.getTester().hasOpenMPI(): + output = self.getRunOutput().getOutput(sanitize=False) + if 'MPI_ABORT' in output: + output_changed = False + if output: + for null in ['\0', '\x00']: + prefix = '-'*74 + '\n' + prefix_with_null = prefix + null + if prefix_with_null in output: + output = output.replace(prefix_with_null, prefix, 1) + output_changed = True + if output_changed: + self.getRunOutput().setOutput(output) + + + def kill(self): + if self.hpc_job: + self.run_hpc.killHPCJob(self.hpc_job) + + def fileIsReady(self, file): + """ + Checks if a file is ready for reading. + + In summary: + - Check if the file exists + - If the file exists, make sure that it has the terminator + string (to know that we have the full file) + - Remove the terminator string + """ + # The file terminator check (to we have the up-to-date copy of the file) + # is dependent on whether or not the file is a binary + is_binary = self.isFileBinary(file) + # If this returns None, it means that the "file" command couldn't determine + # the file type, which may be the case if we have an incomplete file so + # just continue and check on the next iteration + if is_binary is None: + return False + + ending_comment = self.run_hpc.getOutputEndingComment(self.hpc_job.id) + + # Binary file + if is_binary: + with open(file, "rb+") as file: + # We'll be looking for this many characters + len_comment = len(ending_comment) + + # Move to the end and figure out the position + # back where our terminator should be + file.seek(0, os.SEEK_END) + pos = file.tell() - len_comment + + # File is shorter than our comment + if pos < 0: + return False + + # Move to the position where our terminator _should_ be + file.seek(pos) + + # We try here in the event that we're loading + # an earlier part of the file and we can't decode + try: + contents = file.read(len_comment).decode('utf-8') + except: + return False + + # Terminator isn't there + if contents != ending_comment: + return False + + # Remove the terminator + file.seek(pos) + file.truncate() + + return True + # Text file + else: + line, pos = self.getLastLine(file) + if ending_comment == line: + with open(file, "r+", encoding="utf-8") as f: + f.seek(pos) + f.truncate() + return True + + return False + + @staticmethod + def isFileBinary(file): + """ + Returns whether or not the given file is a binary file. + + If None, a failure was encountered when checking the file type. + """ + try: + call_file = subprocess.check_output(['file', '--mime-encoding', file], text=True) + except: + return None + + # Will return something like ": ", + # where =binary when the file is binary + find_binary = re.search('binary$', call_file) + return find_binary is not None + + @staticmethod + def getLastLine(file): + """ + Gets the last line of a text file and the position + in the file at which that last line is. 
+ """ + with open(file, 'rb') as f: + try: + f.seek(-2, os.SEEK_END) + while f.read(1) != b'\n': + f.seek(-2, os.SEEK_CUR) + except OSError: # one line filecd + f.seek(0) + pos = f.tell() + line = f.readline().decode('utf-8') + return line, pos diff --git a/python/TestHarness/runners/Runner.py b/python/TestHarness/runners/Runner.py new file mode 100644 index 000000000000..89b0e36d0f32 --- /dev/null +++ b/python/TestHarness/runners/Runner.py @@ -0,0 +1,111 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os, json +from TestHarness import OutputInterface, util + +class Runner(OutputInterface): + """ + Base class for running a process via a command. + + Used within the Tester to actually run a test. We need + this specialized so that we can either run things locally + or externally (i.e., PBS, slurm, etc on HPC) + """ + def __init__(self, job, options): + OutputInterface.__init__(self) + + # The job that this runner is for + self.job = job + # The test harness options + self.options = options + # The job's exit code, should be set after wait() + self.exit_code = None + # The output for the actual run of the job. We keep this + # separate from self.output in this Runner because HPC + # jobs always have a file output, so we want to store + # their output separately + self.run_output = OutputInterface() + + def getRunOutput(self): + """ Get the OutputInterface object for the actual run """ + return self.run_output + + def spawn(self, timer): + """ + Spawns the process. + + Wait for the process to complete with wait(). + + Should be overridden. + """ + pass + + def wait(self, timer): + """ + Waits for the process started with spawn() to complete. + + Should be overridden. + """ + pass + + def kill(self): + """ + Kills the process started with spawn() + + Should be overridden. + """ + pass + + def finalize(self): + """ + Finalizes the output, which should be called at the end of wait() + """ + # Load the redirected output files, if any + run_output = self.getRunOutput() + for file_path in self.job.getTester().getRedirectedOutputFiles(self.options): + run_output.appendOutput(util.outputHeader(f'Begin redirected output {file_path}')) + if os.access(file_path, os.R_OK): + with open(file_path, 'r+b') as f: + run_output.appendOutput(self.readOutput(f)) + else: + self.job.setStatus(self.job.error, 'FILE TIMEOUT') + self.appendOutput(f'File {file_path} unavailable') + run_output.appendOutput(util.outputHeader(f'End redirected output {file_path}')) + + def getExitCode(self): + """ + Gets the error code of the process. + """ + return self.exit_code + + def sendSignal(self, signal): + """ + Sends a signal to the process. + + Can be overridden. + """ + raise Exception('sendSignal not supported for this Runner') + + def readOutput(self, stream): + """ + Helper for reading output from a stream, and setting an error state + if the read failed. 
+ """ + output = '' + try: + stream.seek(0) + output = stream.read().decode('utf-8') + except UnicodeDecodeError: + self.job.setStatus(self.job.error, 'non-unicode characters in output') + except: + self.job.setStatus(self.job.error, 'error reading output') + if output and output[-1] != '\n': + output += '\n' + return output diff --git a/python/TestHarness/runners/SubprocessRunner.py b/python/TestHarness/runners/SubprocessRunner.py new file mode 100644 index 000000000000..f4a4f9bc8d32 --- /dev/null +++ b/python/TestHarness/runners/SubprocessRunner.py @@ -0,0 +1,133 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os, platform, subprocess, shlex, time +from tempfile import SpooledTemporaryFile +from signal import SIGTERM +from TestHarness.runners.Runner import Runner +from TestHarness import util + +class SubprocessRunner(Runner): + """ + Runner that spawns a local subprocess. + """ + def __init__(self, job, options): + Runner.__init__(self, job, options) + + # The output file handler + self.outfile = None + # The error file handler + self.errfile = None + # The underlying subprocess + self.process = None + + def spawn(self, timer): + tester = self.job.getTester() + use_shell = tester.specs["use_shell"] + cmd = tester.getCommand(self.options) + tester.setCommandRan(cmd) + + # Split command into list of args to be passed to Popen + if not use_shell: + cmd = shlex.split(cmd) + + self.process = None + self.outfile = SpooledTemporaryFile(max_size=1000000) # 1M character buffer + self.errfile = SpooledTemporaryFile(max_size=100000) # 100K character buffer + + process_args = [cmd] + process_kwargs = {'stdout': self.outfile, + 'stderr': self.errfile, + 'close_fds': False, + 'shell': use_shell, + 'cwd': tester.getTestDir()} + # On Windows, there is an issue with path translation when the command is passed in + # as a list. + if platform.system() == "Windows": + process_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP + else: + process_kwargs['preexec_fn'] = os.setsid + + # Special logic for openmpi runs + if tester.hasOpenMPI(): + process_env = os.environ.copy() + + # Don't clobber state + process_env['OMPI_MCA_orte_tmpdir_base'] = self.job.getTempDirectory().name + # Allow oversubscription for hosts that don't have a hostfile + process_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' + + process_kwargs['env'] = process_env + + try: + self.process = subprocess.Popen(*process_args, **process_kwargs) + except Exception as e: + raise Exception('Error in launching a new task') from e + + timer.start('runner_run') + + def wait(self, timer): + self.process.wait() + + timer.stop('runner_run') + + self.exit_code = self.process.poll() + + # This should have been cleared before the job started + if self.getRunOutput().hasOutput(): + raise Exception('Runner run output was not cleared') + + # Load combined output + for file in [self.outfile, self.errfile]: + file.flush() + output = self.readOutput(file) + file.close() + + # For some reason openmpi will append a null character at the end + # when the exit code is nonzero. Not sure why this is... 
but remove + # it until we figure out what's broken + if file == self.errfile and self.exit_code != 0 \ + and self.job.getTester().hasOpenMPI() and len(output) > 2 \ + and output[-3:] in ['\n\0\n', '\n\x00\n']: + output = output[:-3] + + self.getRunOutput().appendOutput(output) + + def kill(self): + if self.process is not None: + try: + if platform.system() == "Windows": + from distutils import spawn + if spawn.find_executable("taskkill"): + subprocess.call(['taskkill', '/F', '/T', '/PID', str(self.process.pid)]) + else: + self.process.terminate() + else: + pgid = os.getpgid(self.process.pid) + os.killpg(pgid, SIGTERM) + except OSError: # Process already terminated + pass + + def sendSignal(self, signal): + # process.poll() returns the process's exit code if it has completed, + # and None if it is still running. This acts as a safety precaution + # against an infinite loop; this will always close. + while self.process.poll() is None: + + # tell() gives the current position in the file. If it is greater + # than zero, the binary has started running and writing output. If + # the output is blank, the moose_test binary hasn't actually started + # doing anything yet. If so, sleep briefly and check again. + if not self.outfile.tell(): + time.sleep(0.05) + + # If the output isn't blank, then we finally send the signal and exit the loop + else: + os.kill(self.process.pid, signal) + break diff --git a/python/TestHarness/schedulers/Job.py b/python/TestHarness/schedulers/Job.py index 9f5deccfff0a..8dfa1b3c383b 100644 --- a/python/TestHarness/schedulers/Job.py +++ b/python/TestHarness/schedulers/Job.py @@ -7,58 +7,154 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import re, os, json -import time +import itertools, re, os, time, threading, traceback from timeit import default_timer as clock from TestHarness.StatusSystem import StatusSystem from TestHarness.FileChecker import FileChecker +from TestHarness.runners.Runner import Runner +from TestHarness import OutputInterface, util +from tempfile import TemporaryDirectory +from collections import namedtuple + +from TestHarness import util + +def time_now(): + return time.time_ns() / (10 ** 9) class Timer(object): """ A helper class for testers to track the time it takes to run. - - Every call to the start method must be followed by a call to stop. 
""" def __init__(self): - self.starts = [] - self.ends = [] - def start(self): - """ starts the timer clock """ - self.starts.append(clock()) - def stop(self): - """ stop/pauses the timer clock """ - self.ends.append(clock()) - def cumulativeDur(self): - """ returns the total/cumulative time taken by the timer """ - diffs = [end - start for start, end in zip(self.starts, self.ends)] - return sum(diffs) - def averageDur(self): - return self.cumulativeDur() / len(self.starts) - def nRuns(self): - return len(self.starts) - def reset(self): - self.starts = [] - self.ends = [] - -class Job(object): + # Dict of time name -> (start,) or (start,end) + self.times = {} + # Threading lock for setting timers + self.lock = threading.Lock() + + @staticmethod + def time_now() -> float: + """ Helper for getting a precise now time """ + return float(time.time_ns() / (10 ** 9)) + + def start(self, name: str, at_time=None): + """ Start the given timer """ + if not at_time: + at_time = self.time_now() + with self.lock: + self.times[name] = [at_time] + + def stop(self, name: str, at_time=None): + """ End the given timer """ + if not at_time: + at_time = self.time_now() + with self.lock: + entry = self.times.get(name) + if not entry: + raise Exception(f'Missing time entry {name}') + if len(entry) > 1: + raise Exception(f'Time entry {name} already stopped') + entry.append(at_time) + + def startMain(self): + """ Get the start time for the main timer """ + self.start('main') + + def stopMain(self): + """ Get the end time for the main timer """ + self.stop('main') + + def hasTime(self, name: str): + """ Whether or not the given timer exists """ + with self.lock: + return name in self.times + + def hasTotalTime(self, name: str): + """ Whether or not the given total time exists """ + with self.lock: + entry = self.times.get(name) + if not entry: + return False + return len(entry) > 1 + + def totalTime(self, name='main'): + """ Get the total time for the given timer """ + with self.lock: + entry = self.times.get(name) + if not entry: + if name == 'main': + return 0 + raise Exception(f'Missing time entry {name}') + + if len(entry) > 1: + return entry[1] - entry[0] + return time_now() - entry[0] + + def totalTimes(self): + """ Get the total times """ + times = {} + for name, entry in self.times.items(): + times[name] = self.totalTime(name) + return times + + def startTime(self, name): + """ Get the start time """ + with self.lock: + entry = self.times.get(name) + if not entry: + raise Exception(f'Missing time entry {name}') + return entry[0] + + def reset(self, name = None): + """ Resets a given timer or all timers """ + with self.lock: + if name: + if name not in self.times: + raise Exception(f'Missing time entry {name}') + del self.times[name] + else: + self.times.clear() + + class TimeManager: + """ Context manager for timing a section """ + def __init__(self, timer, name: str): + self.timer = timer + self.name = name + def __enter__(self): + self.timer.start(self.name) + def __exit__(self, exc_type, exc_val, exc_tb): + self.timer.stop(self.name) + + def time(self, name: str): + """ Time a section using a context manager """ + return self.TimeManager(self, name) + +class Job(OutputInterface): """ The Job class is a simple container for the tester and its associated output file object, the DAG, the process object, the exit codes, and the start and end times. 
""" + # Iterator for producing a unique Job ID + id_iter = itertools.count() + + # Thread lock for creating output directories + mkdir_lock = threading.Lock() + + # Tuple for getJointStatus() + JointStatus = namedtuple('JointStatus', ['status', 'message', 'color', 'status_code', 'sort_value']) + def __init__(self, tester, job_dag, options): + OutputInterface.__init__(self) + + self.id = next(self.id_iter) self.options = options + self.__j_lock = threading.Lock() self.__tester = tester self.specs = tester.specs self.__job_dag = job_dag self.timer = Timer() - self.__outfile = None - self.__start_time = clock() - self.__end_time = None - self.__previous_time = None self.__joined_out = '' self.report_timer = None self.__slots = None - self.__meta_data = {} # Create a fileChecker object to be able to call filecheck methods self.fileChecker = FileChecker(self.options.input_file_name) @@ -74,7 +170,7 @@ def __init__(self, tester, job_dag, options): self.__job_message = '' ### Enumerate the job statuses we want to use - self.job_status = StatusSystem() + self.job_status = StatusSystem(locking=True) self.hold = self.job_status.hold self.queued = self.job_status.queued @@ -96,6 +192,54 @@ def __init__(self, tester, job_dag, options): # Initialize jobs with a holding status self.setStatus(self.hold) + # Whether or not we should forcefully report the status of this Job + # the next time report statuses + self.force_report_status = False + + # The object that'll actually do the run + self._runner = None + + # A temp directory for this Job, if requested + self.tmp_dir = None + + def __del__(self): + # Do any cleaning that we can (removes the temp dir for now if it exists) + self.cleanup() + + def getID(self): + """Returns the unique ID for the job""" + return self.id + + def setRunner(self, runner: Runner): + """Sets the underlying Runner object that will run the command""" + self._runner = runner + + def getLock(self): + """ Get the lock associated with this job """ + return self.__j_lock + + def getTempDirectory(self): + """ + Gets a shared temp directory that will be cleaned up for this Tester + """ + if self.tmp_dir is None: + self.tmp_dir = TemporaryDirectory(prefix='tester_') + return self.tmp_dir + + def cleanup(self): + """ + Entry point for doing any cleaning if necessary. 
+ + Currently just cleans up the temp directory + """ + if self.tmp_dir is not None: + # Don't let this fail + try: + self.tmp_dir.cleanup() + except: + pass + self.tmp_dir = None + def getUpstreams(self): """ Return a list of all the jobs that needed to be completed before this job """ dag = self.getDAG() @@ -126,7 +270,7 @@ def getTestName(self): def getTestNameShort(self): """ Return the shorthand Test name """ - return self.getTestName().split('.')[1] + return self.__tester.getTestNameShort() def getPrereqs(self): """ Wrapper method to return the testers prereqs """ @@ -140,6 +284,10 @@ def addCaveats(self, kwargs): """ Wrapper method for setting caveats """ return self.__tester.addCaveats(kwargs) + def removeCaveat(self, caveat): + """ Wrapper method for removing caveats """ + return self.__tester.removeCaveat(caveat) + def getCaveats(self): """ Wrapper method for getting caveats """ return self.__tester.getCaveats() @@ -152,13 +300,20 @@ def getCommand(self): """ Wrapper method for returing command """ return self.__tester.getCommand(self.options) + def getCommandRan(self): + """ Wrapper method for returing command ran """ + return self.__tester.getCommandRan() + def getRunnable(self): """ Wrapper method to return getRunnable """ return self.__tester.getRunnable(self.options) - def getOutputFiles(self): - """ Wrapper method to return getOutputFiles """ - return self.__tester.getOutputFiles() + def getOutputFiles(self, options): + """ Wrapper method to return getOutputFiles (absolute path) """ + files = [] + for file in self.__tester.getOutputFiles(options): + files.append(os.path.join(self.__tester.getTestDir(), file)) + return files def getMaxTime(self): """ Wrapper method to return getMaxTime """ @@ -184,15 +339,6 @@ def getUniquePrereqs(self): unique_prereqs.append(os.path.join(self.getTestDir(), prereq)) return unique_prereqs - def addMetaData(self, **kwargs): - """ Allow derived methods to store additional data which ends up in the data storage file """ - for key, value in kwargs.items(): - self.__meta_data[key] = value - - def getMetaData(self): - """ return data stored by addMetaData """ - return self.__meta_data - def getSlots(self): """ Return the number of slots this job consumes """ if self.__slots == None: @@ -213,114 +359,319 @@ def run(self): A blocking method to handle the exit status of the process object while keeping track of the time the process was active. When the process exits, read the output and close the file. 
""" - - # Do not execute app, but allow processResults to commence - if not self.__tester.shouldExecute(): + tester = self.__tester + + # Start the main timer for running + self.timer.startMain() + + # Helper for exiting + def finalize(): + # Run cleanup + with self.timer.time('job_cleanup'): + self.cleanup() + # Sanitize the output from all objects + self.sanitizeAllOutput() + # Stop timing + self.timer.stopMain() + + # Set the output path if its separate and initialize the output + if self.hasSeperateOutput(): + # Need to potentially create the output directory + self.createOutputDirectory() + + # Failed to create the directory + if self.isError(): + finalize() + return + + # Set the output path for each object + for name, object in self.getOutputObjects().items(): + output_path = self.getOutputPathPrefix() + f'.{name}_out.txt' + object.setSeparateOutputPath(output_path) + object.clearOutput() + + # Helper for trying and catching + def try_catch(do, exception_name, timer_name): + with self.timer.time(timer_name): + failed = False + try: + do() + except: + trace = traceback.format_exc() + self.setStatus(self.error, f'{exception_name} EXCEPTION') + self.appendOutput(util.outputHeader('Python exception encountered') + trace) + failed = True + + if failed: + finalize() + return not failed + + # Do not execute app, but still run the tester + # This is truly awful and I really hate that it got put in here, + # please remove it if you can. + if not tester.shouldExecute(): + run_tester = lambda: tester.run(self.options, 0, '') + try_catch(run_tester, 'TESTER RUN', 'tester_run') return if self.options.pedantic_checks and self.canParallel(): # Before the job does anything, get the times files below it were last modified - self.fileChecker.get_all_files(self, self.fileChecker.getOriginalTimes()) - self.addCaveats('pedantic check') - time.sleep(1) + with self.timer.time('pedantic_init'): + self.fileChecker.get_all_files(self, self.fileChecker.getOriginalTimes()) + self.addCaveats('pedantic check') + time.sleep(1) + + with self.timer.time('tester_prepare'): + tester.prepare(self.options) + + # Verify that the working directory is available right before we execute + if not os.path.exists(tester.getTestDir()): + self.setStatus(self.error, 'WORKING DIRECTORY NOT FOUND') + finalize() + return + # Getting the command can also cause a failure, so try that + tester.getCommand(self.options) + if tester.isError(): + finalize() + return - self.__tester.prepare(self.options) + # Spawn the process + spawn = lambda: self._runner.spawn(self.timer) + if not try_catch(spawn, 'RUNNER SPAWN', 'runner_spawn'): + return - self.__start_time = clock() - self.timer.reset() - self.__tester.run(self.timer, self.options) - self.__start_time = self.timer.starts[0] - self.__end_time = self.timer.ends[-1] - self.__joined_out = self.__tester.joined_out + # Entry point for testers to do other things + post_spawn = lambda: tester.postSpawn(self._runner) + if not try_catch(post_spawn, 'TESTER POST SPAWN', 'tester_post_spawn'): + return - if self.options.pedantic_checks and self.canParallel(): - # Check if the files we checked on earlier were modified. 
- self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) - self.modifiedFiles = self.fileChecker.check_changes(self.fileChecker.getOriginalTimes(), self.fileChecker.getNewTimes()) + # And wait for it to complete + wait = lambda: self._runner.wait(self.timer) + if not try_catch(wait, 'RUNNER WAIT', 'runner_wait'): + return - def killProcess(self): - """ Kill remaining process that may be running """ - self.__tester.killCommand() - - def getStartTime(self): - """ Return the time the process started """ - return self.__start_time - - def getEndTime(self): - """ Return the time the process exited """ - return self.__end_time - - def getOutput(self): - """ Return the contents of output """ - return self.__joined_out - - def getOutputFile(self): - """ Return the output file path """ - if ((self.options.pbs - or self.options.ok_files - or self.options.fail_files - or self.options.sep_files) - and (self.isPass() or self.isFail())): - (status, message, color, exit_code, sort_value) = self.getJointStatus() - output_dir = self.options.output_dir if self.options.output_dir else self.getTestDir() - output_file = os.path.join(output_dir, - '.'.join([os.path.basename(self.getTestDir()), - self.getTestNameShort().replace(os.sep, '.'), - status, - 'txt'])) - return os.path.join(output_dir, output_file) - - def setOutput(self, output): - """ Method to allow schedulers to overwrite the output if certain conditions are met """ - if (not self.__tester.outfile is None and not self.__tester.outfile.closed - and not self.__tester.errfile is None and not self.__tester.errfile.closed): + # Job error occurred, which means the Runner didn't complete + # so don't process anything else + if self.isError(): + finalize() return - # Check for invalid unicode in output - try: - json.dumps(output) + # And do finalize (really just cleans up output) + runner_finalize = lambda: self._runner.finalize() + if not try_catch(runner_finalize, 'RUNNER FINALIZE', 'runner_finalize'): + finalize() + return + + # Exit if we have bad output in the runner before running the tester + self.sanitizeAllOutput() + if self.isError(): + finalize() + return - except UnicodeDecodeError: - # convert invalid output to something json can handle - output = output.decode('utf-8','replace').encode('ascii', 'replace') + # Check if the files we checked on earlier were modified. 
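# Job.run() above only talks to the Runner interface -- spawn(), wait(), getExitCode(),
# finalize() and getRunOutput() -- which is what lets the same Job drive either a local
# SubprocessRunner or an HPCRunner. A toy subclass sketch (EchoRunner is hypothetical,
# for illustration only) shows the minimum a new Runner has to provide:
import subprocess
from TestHarness.runners.Runner import Runner

class EchoRunner(Runner):
    """Runs `echo <message>` locally and captures its output for the Job."""
    def __init__(self, job, options, message='hello'):
        super().__init__(job, options)
        self.message = message
        self.process = None

    def spawn(self, timer):
        timer.start('runner_run')
        self.process = subprocess.Popen(['echo', self.message],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)

    def wait(self, timer):
        stdout, _ = self.process.communicate()
        timer.stop('runner_run')
        self.exit_code = self.process.returncode
        # Put the process output where the Job expects to find it
        self.getRunOutput().appendOutput(stdout.decode('utf-8'))

    def kill(self):
        if self.process is not None:
            self.process.kill()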
+ if self.options.pedantic_checks and self.canParallel(): + with self.timer.time('pedantic_check'): + self.fileChecker.get_all_files(self, self.fileChecker.getNewTimes()) + self.modifiedFiles = self.fileChecker.check_changes(self.fileChecker.getOriginalTimes(), + self.fileChecker.getNewTimes()) - # Alert the user that output has invalid characters - self.addCaveats('invalid characters in stdout') + # Allow derived proccessResults to process the output and set a failing status (if it failed) + runner_output = self._runner.getRunOutput().getOutput() + exit_code = self._runner.getExitCode() + run_tester = lambda: tester.run(self.options, exit_code, runner_output) + try_catch(run_tester, 'TESTER RUN', 'tester_run') - self.__joined_out = output + # Run finalize now that we're done + finalize() + + def killProcess(self): + """ Kill remaining process that may be running """ + if self._runner: + try: + self._runner.kill() + except: + pass + self.cleanup() + + def getOutputObjects(self) -> dict: + """ + Get a dict of all of the objects that contribute to output + + The key is a name which is a human readable name of the object + """ + objects = {} + if self.getRunner(): + objects['runner_run'] = self.getRunner().getRunOutput() + objects['runner'] = self.getRunner() + objects['tester'] = self.getTester() + objects['job'] = self + return objects + + def getCombinedSeparateOutputPaths(self): + """ + Gets a dict of all of the --sep-files file paths that were produced - def getActiveTime(self): - """ Return active time """ - m = re.search(r"Active time=(\S+)", self.__joined_out) - if m != None: - return float(m.group(1)) + The key is a name which is a human readable name of the object + """ + paths = {} + for name, object in self.getOutputObjects().items(): + paths[name] = object.getSeparateOutputFilePath() if object.hasOutput() else None + return paths + + def getAllOutput(self) -> dict: + """ Get all output in a dict from each object to the text output """ + output = {} + for name, object in self.getOutputObjects().items(): + output[name] = object.getOutput() + return output + + def sanitizeAllOutput(self): + """ Sanitizes the output from all output objects + + If output is retreived from these objects via getOutput() and + it contains bad output, it will throw an error. Instead of + throwing an error, we will sanitize it before hand and then + set a Job error so that we can still continue in a failed state. 
+ """ + all_failures = [] + for name, object in self.getOutputObjects().items(): + failures = object.sanitizeOutput() + all_failures.extend([s + f' in {name}' for s in failures]) + if all_failures: + self.setStatus(self.error, ', '.join(all_failures)) + + def getOutputForScreen(self): + """ Gets the output for printing on screen """ + show_output = self.options.verbose or (self.isFail() and not self.options.quiet) or self.isError() + if not show_output: + return None + + if self.getCommandRan(): + command = self.getCommandRan() + else: + command = self.getCommand() + + output = 'Working Directory: ' + self.getTestDir() + '\nRunning command: ' + command + '\n' + + # Whether or not to skip the runner_run output, which is the output from the + # actual run (the process that the harness runs) + skip_runner_run = None + if self.options.sep_files and not self.options.verbose: + skip_runner_run = '--sep-files' + + options = self.options + specs = self.specs + + for name, object in self.getOutputObjects().items(): + object_output = object.getOutput() + + # Nothing to output + if not object_output: + continue + + # Max size of this output for trimming + # Currently only used for the runner_run output + max_size = None + + # Possibly trim or skip the runner_run output (actual process output) + if name == 'runner_run': + # Don't output the runner run + if skip_runner_run: + output += f'\nSkipping runner_run output due to {skip_runner_run}; output located at:\n' + output += object.getSeparateOutputFilePath() + '\n' + continue + + # Default trimmed output size + max_size = 100000 + # max_buffer_size is set + if specs.isValid('max_buffer_size'): + # ...to the max + if specs['max_buffer_size'] == -1: + max_size = None + # ... or to a value + else: + max_size = int(specs['max_buffer_size']) + # Disable trimmed output + if options.no_trimmed_output: + max_size = None + # Don't trim output on error, and we errored + if options.no_trimmed_output_on_error and self.isFail(): + max_size = None + + # Add a complete line break between objects + if output: + output += '\n' + # Add a header before the output starts + output += util.outputHeader(f'Begin {name} output', ending=False) + '\n' + # Add the output, trimming if needed + output += util.trimOutput(object_output, max_size=max_size) + # Add a newline if one is missing + if output[-1] != '\n': + output += '\n' + # Add a footer after the output ends + output += '\n' + util.outputHeader(f'End {name} output', ending=False) + + # Add the text name prefix + if output: + lines = output.split('\n') + joint_status = self.getJointStatus() + prefix = util.colorText(self.getTestName() + ': ', joint_status.color, + colored=self.options.colored, code=self.options.code) + output = prefix + ('\n' + prefix).join(lines) + + return output + + def getRunner(self): + """ Gets the Runner that actually runs the command """ + return self._runner + + def getOutputDirectory(self): + """ Get the directory for output for this job """ + if not self.options.output_dir: + return self.getTestDir() + return os.path.join(self.options.output_dir, self.getTestName()[:-len(self.getTestNameShort())-1]) + + def createOutputDirectory(self): + """ Create the output directory for this job, if needed """ + if not self.options.output_dir: + return + output_dir = self.getOutputDirectory() + with Job.mkdir_lock: + if not os.path.isdir(output_dir): + try: + os.makedirs(output_dir) + except OSError as ex: + if ex.errno == errno.EEXIST: + pass + else: + self.setStatus(self.error, f'DIRECTORY CREATION 
FAILURE') + self.appendOutput(f'Failed to create Job directory {output_dir}') + + def getOutputPathPrefix(self): + """ + Returns a file prefix that is unique to this job - def getSolveTime(self): - """ Return solve time """ - m = re.search(r"solve().*", self.__joined_out) - if m != None: - return m.group().split()[5] + Should be used for all TestHarness produced files for this job + """ + return os.path.join(self.getOutputDirectory(), self.getTestNameShort().replace(os.sep, '.')) - def setPreviousTime(self, t): + def hasSeperateOutput(self): """ - Allow an arbitrary time to be set. This is used by the QueueManager - to set the time as recorded by a previous TestHarness instance. + Whether or not this job has separate output. + + That is, whether or not we should pipe output to a file """ - self.__previous_time = t + return self.options.sep_files def getTiming(self): """ Return active time if available, if not return a comparison of start and end time """ - if self.getActiveTime(): - return self.getActiveTime() - elif self.getEndTime() and self.getStartTime(): - return self.timer.cumulativeDur() - elif self.getStartTime() and self.isRunning(): - # If the test is still running, return current run time instead - return max(0.0, clock() - self.getStartTime()) - elif self.__previous_time: - return self.__previous_time - else: - return 0.0 + # Actual execution time + if self.timer.hasTime('runner_run'): + return self.timer.totalTime('runner_run') + # Job has started + if self.timer.hasTime('main'): + return self.timer.totalTime() + return 0.0 def getStatus(self): return self.job_status.getStatus() @@ -336,8 +687,8 @@ def setStatus(self, status, message=''): def createStatus(self): return self.job_status.createStatus() - def previousTesterStatus(self, options, previous_storage=None): - return self.__tester.previousTesterStatus(options, previous_storage) + def previousTesterStatus(self): + return self.__tester.previousTesterStatus(self.options) def getStatusMessage(self): return self.__job_message @@ -361,7 +712,7 @@ def isQueued(self): return (_status == self.queued and self.isNoStatus()) \ or (_status in self.__finished_statuses and self.__tester.isQueued()) def isRunning(self): - return self.getStatus() == self.running + return self.getStatus() in self.job_status.getPendingStatuses() def isTimeout(self): return self.getStatus() == self.timeout def isPending(self): @@ -392,16 +743,73 @@ def getJointStatus(self): """ # Job has failed, or tester has no status if self.isError() or self.isNoStatus(): - return (self.getStatus().status, - self.getStatusMessage(), - self.getStatus().color, - self.getStatus().code, - self.getStatus().sort_value) + return Job.JointStatus(status=self.getStatus().status, + message=self.getStatusMessage(), + color=self.getStatus().color, + status_code=self.getStatus().code, + sort_value=self.getStatus().sort_value) # Tester has a finished status of some sort + return Job.JointStatus(status=self.__tester.getStatus().status, + message=self.__tester.getStatusMessage(), + color=self.__tester.getStatus().color, + status_code=self.__tester.getStatus().code, + sort_value=self.__tester.getStatus().sort_value) + + def storeResults(self, scheduler): + """ Store the results for this Job into the results storage """ + joint_status = self.getJointStatus() + + # Base job data + job_data = {'name' : self.getTestNameShort(), + 'long_name' : self.getTestName(), + 'timing' : self.timer.totalTimes(), + 'status' : joint_status.status, + 'status_message' : joint_status.message, + 'fail' : 
self.isFail(), + 'color' : joint_status.color, + 'caveats' : list(self.getCaveats()), + 'tester' : self.getTester().getResults(self.options)} + if self.hasSeperateOutput(): + job_data['output_files'] = self.getCombinedSeparateOutputPaths() else: - return (self.__tester.getStatus().status, - self.__tester.getStatusMessage(), - self.__tester.getStatus().color, - self.__tester.getStatus().code, - self.__tester.getStatus().sort_value) + job_data['output'] = self.getAllOutput() + + # Extend with data from the scheduler, if any + job_data.update(scheduler.appendResultFileJob(self)) + + # Get the entry we're loading into + test_dir_entry, test_entry = self.getTester().getResultsEntry(self.options, True) + + # Add the job data + test_entry.update(job_data) + + def loadPreviousResults(self): + """ Loads the previous results for this job for the results storage """ + # False here means don't create it + test_dir_entry, test_entry = self.getTester().getResultsEntry(self.options, False) + + # Set the tester status + tester = self.getTester() + status, message, caveats = self.previousTesterStatus() + tester.setStatus(status, message) + if caveats: + tester.addCaveats(caveats) + + # Set the previous times + self.timer.reset() + time_now = Timer.time_now() + for name, total_time in test_entry['timing'].items(): + self.timer.start(name, time_now) + self.timer.stop(name, time_now + total_time) + + # Load the output + output_files = test_entry.get('output_files') + output = test_entry.get('output') + for name, object in self.getOutputObjects().items(): + if output_files: # --sep-files + object.setSeparateOutputPath(output_files[name]) + elif output: # stored in result + object.setOutput(output[name]) + else: + raise Exception(f'Test {self.getTestName()} missing output') diff --git a/python/TestHarness/schedulers/PBScodes.py b/python/TestHarness/schedulers/PBScodes.py index d90eead12369..e2a8e8b2195c 100644 --- a/python/TestHarness/schedulers/PBScodes.py +++ b/python/TestHarness/schedulers/PBScodes.py @@ -43,35 +43,35 @@ # These errors come from src/include/job.h for OpenPBS # Negative exit status indicates that the job could not be executed. 
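For reference, the replacement table below keys on the integer exit status and stores (NAME, description) tuples so that callers can unpack both pieces at once. A minimal consumption sketch, mirroring how RunPBS.updateHPCJobs uses it later in this diff:

    name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason'))
    if name == 'JOB_EXEC_KILL_WALLTIME':
        # treated by RunPBS as a recoverable timeout
        ...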
-PBS_User_EXITCODES = { '0' : 'JOB_EXEC_OK:Job execution was successful', - '-1' : 'JOB_EXEC_FAIL1:Job execution failed, before files, no retry', - '-2' : 'JOB_EXEC_FAIL2:Job execution failed, after files, no retry', - '-3' : 'JOB_EXEC_RETRY:Job execution failed, do retry', - '-4' : 'JOB_EXEC_INITABT:Job aborted on MoM initialization', - '-5' : 'JOB_EXEC_INITRST:Job aborted on MoM initialization, checkpoint, no migrate', - '-6' : 'JOB_EXEC_INITRMG:Job aborted on MoM initialization, checkpoint, ok migrate', - '-7' : 'JOB_EXEC_BADRESRT:Job restart failed', - '-10' : 'JOB_EXEC_FAILUID:Invalid UID/GID for job', - '-11' : 'JOB_EXEC_RERUN:Job was rerun', - '-12' : 'JOB_EXEC_CHKP:Job was checkpointed and killed', - '-13' : 'JOB_EXEC_FAIL_PASSWORD:Job failed due to a bad password', - '-14' : 'JOB_EXEC_RERUN_ON_SIS_FAIL:Job was requeued (if rerunnable) or deleted (if not) due to a communication failure between the primary execution host MoM and a Sister', - '-15' : 'JOB_EXEC_QUERST:Requeue job for restart from checkpoint', - '-16' : 'JOB_EXEC_FAILHOOK_RERUN:Job execution failed due to hook rejection; requeue for later retry', - '-17' : 'JOB_EXEC_FAILHOOK_DELETE:Job execution failed due to hook rejection; delete the job at end', - '-18' : 'JOB_EXEC_HOOK_RERUN:A hook requested for job to be requeued', - '-19' : 'JOB_EXEC_HOOK_DELETE:A hook requested for job to be deleted', - '-20' : 'JOB_EXEC_RERUN_MS_FAIL:Job requeued because server could not contact the primary execution host MoM', - '-21' : 'JOB_EXEC_FAIL_SECURITY:Security breach in PBS directory', - '-22' : 'JOB_EXEC_HOOKERROR:Job exec failed due to unexpected exception or hook execution timed out', - '-23' : 'JOB_EXEC_FAIL_KRB5:Error no kerberos credentials supplied', - '-24' : 'JOB_EXEC_KILL_NCPUS_BURST:Job exec failed due to exceeding ncpus (burst)', - '-25' : 'JOB_EXEC_KILL_NCPUS_SUM:Job exec failed due to exceeding ncpus (sum)', - '-26' : 'JOB_EXEC_KILL_VMEM:Job exec failed due to exceeding vmem', - '-27' : 'JOB_EXEC_KILL_MEM:Job exec failed due to exceeding mem', - '-28' : 'JOB_EXEC_KILL_CPUT:Job exec failed due to exceeding cput', - '-29' : 'JOB_EXEC_KILL_WALLTIME:Job exec failed due to exceeding walltime', - '-30' : 'JOB_EXEC_JOINJOB:Job exec failed due to join job error' } +PBS_User_EXITCODES = { 0 : ('JOB_EXEC_OK', 'Job execution was successful'), + -1 : ('JOB_EXEC_FAIL1', 'Job execution failed, before files, no retry'), + -2 : ('JOB_EXEC_FAIL2', 'Job execution failed, after files, no retry'), + -3 : ('JOB_EXEC_RETRY', 'Job execution failed, do retry'), + -4 : ('JOB_EXEC_INITABT', 'Job aborted on MoM initialization'), + -5 : ('JOB_EXEC_INITRST', 'Job aborted on MoM initialization, checkpoint, no migrate'), + -6 : ('JOB_EXEC_INITRMG', 'Job aborted on MoM initialization, checkpoint, ok migrate'), + -7 : ('JOB_EXEC_BADRESRT', 'Job restart failed'), + -10 : ('JOB_EXEC_FAILUID', 'Invalid UID/GID for job'), + -11 : ('JOB_EXEC_RERUN', 'Job was rerun'), + -12 : ('JOB_EXEC_CHKP', 'Job was checkpointed and killed'), + -13 : ('JOB_EXEC_FAIL_PASSWORD', 'Job failed due to a bad password'), + -14 : ('JOB_EXEC_RERUN_ON_SIS_FAIL', 'Job was requeued (if rerunnable) or deleted (if not) due to a communication failure between the primary execution host MoM and a Sister'), + -15 : ('JOB_EXEC_QUERST', 'Requeue job for restart from checkpoint'), + -16 : ('JOB_EXEC_FAILHOOK_RERUN', 'Job execution failed due to hook rejection; requeue for later retry'), + -17 : ('JOB_EXEC_FAILHOOK_DELETE', 'Job execution failed due to hook rejection; delete the job at end'), + 
-18 : ('JOB_EXEC_HOOK_RERUN', 'A hook requested for job to be requeued'), + -19 : ('JOB_EXEC_HOOK_DELETE', 'A hook requested for job to be deleted'), + -20 : ('JOB_EXEC_RERUN_MS_FAIL', 'Job requeued because server could not contact the primary execution host MoM'), + -21 : ('JOB_EXEC_FAIL_SECURITY', 'Security breach in PBS directory'), + -22 : ('JOB_EXEC_HOOKERROR', 'Job exec failed due to unexpected exception or hook execution timed out'), + -23 : ('JOB_EXEC_FAIL_KRB5', 'Error no kerberos credentials supplied'), + -24 : ('JOB_EXEC_KILL_NCPUS_BURST', 'Job exec failed due to exceeding ncpus (burst)'), + -25 : ('JOB_EXEC_KILL_NCPUS_SUM', 'Job exec failed due to exceeding ncpus (sum)'), + -26 : ('JOB_EXEC_KILL_VMEM', 'Job exec failed due to exceeding vmem'), + -27 : ('JOB_EXEC_KILL_MEM', 'Job exec failed due to exceeding mem'), + -28 : ('JOB_EXEC_KILL_CPUT', 'Job exec failed due to exceeding cput'), + -29 : ('JOB_EXEC_KILL_WALLTIME', 'Job exec failed due to exceeding walltime'), + -30 : ('JOB_EXEC_JOINJOB', 'Job exec failed due to join job error') } PBS_STATUSES = { '0' : 'UNKNOWN', 'B' : 'BEGUN', diff --git a/python/TestHarness/schedulers/QueueManager.py b/python/TestHarness/schedulers/QueueManager.py deleted file mode 100644 index 6be1b70d8fba..000000000000 --- a/python/TestHarness/schedulers/QueueManager.py +++ /dev/null @@ -1,446 +0,0 @@ -#* This file is part of the MOOSE framework -#* https://www.mooseframework.org -#* -#* All rights reserved, see COPYRIGHT for full restrictions -#* https://github.com/idaholab/moose/blob/master/COPYRIGHT -#* -#* Licensed under LGPL 2.1, please see LICENSE for details -#* https://www.gnu.org/licenses/lgpl-2.1.html - -import sys, os, json, shutil -from collections import namedtuple -from Scheduler import Scheduler -from TestHarness.StatusSystem import StatusSystem # Determin previous status - -class QueueManager(Scheduler): - """ - QueueManager is a Scheduler plugin responsible for allowing the testers to be scheduled via a - third-party queue system (like PBS). - - The QueueManager works by intercepting and altering the statuses of all but one job contained - in the group to a finished state. This affords us the behavior of only using the runner thread - pool once per group (see augmentJobs). - - Using this one unmodified job, the spec file involved is noted, and instructs the derived - scheduler how to launch this one single spec file (using --spec-file), along with any - supplied/allowable command line arguments (--re, --cli-args, --ignore, etc). - - The third-party queueing manager then executes `run_tests --spec-file /path/to/spec_file`. - - It is the results of this additional ./run_tests run, that is captured and presented to the user as - the finished result of the test. - """ - @staticmethod - def validParams(): - params = Scheduler.validParams() - return params - - def __init__(self, harness, params): - Scheduler.__init__(self, harness, params) - self.harness = harness - self.options = self.harness.getOptions() - self.__job_storage_file = self.harness.original_storage - self.__clean_args = None - self.__status_system = StatusSystem() - - def augmentJobs(self, Jobs): - """ - Filter through incomming jobs and figure out if we are launching them - or attempting to process finished results. - """ - if self.options.dry_run: - return - # Flatten the DAG. 
We want to easily iterate over all jobs produced by the spec file - Jobs.removeAllDependencies() - - # Perform cleanup operations and return if thats what the user wants - if self.options.queue_cleanup: - self._cleanupFiles(Jobs) - return - - # Create a namedtuple of frequently used information contained within Jobs, so we can - # more easily pass this information among our methods - job_list = Jobs.getJobs() - if job_list: - queue_data = namedtuple('JobData', ['jobs', 'job_dir', 'json_data', 'plugin']) - job_data = queue_data(jobs=Jobs, - job_dir=job_list[0].getTestDir(), - json_data=self.options.results_storage, - plugin=self.harness.scheduler.__class__.__name__) - - if self._isProcessReady(job_data): - self._setJobStatus(job_data) - - elif self._isLaunchable(job_data): - self._prepareJobs(job_data) - - def createQueueScript(self, job, template): - """ Write the launch script to disc """ - if self.options.dry_run: - return - # Get a list of prereq tests this test may have - try: - with open(self.params['queue_template'], 'r') as f: - content = f.read() - - with open(template['launch_script'], 'w') as queue_script: - - # Do all of the replacements for valid parameters - for key in template.keys(): - if key.upper() in content: - content = content.replace('<' + key.upper() + '>', str(template[key])) - - # Strip out parameters that were not supplied - for key in template.keys(): - if key.upper() not in content: - content = content.replace('<' + key.upper() + '>', '') - - queue_script.write(content) - except IOError as e: - print(e) - sys.exit(1) - - def reserveSlots(self, job, j_lock): - """ - Inherited method from the Scheduler to handle slot allocation. - QueueManager does not need an allocation system, so this method simply returns True - """ - return True - - def getBadArgs(self): - """ Arguments which should be removed from the launch script invoking ./run_tests """ - return [] - - def getBadKeyArgs(self): - """ Key/Value arguments which should be removed from the launch script invoking ./run_tests """ - return [] - - def getCores(self, job_data): - """ iterate over Jobs and collect the maximum core requirement from the group of jobs which will run """ - slots = 1 - for job in [x for x in job_data.jobs.getJobs() if not x.isSkip()]: - slots = max(slots, job.getSlots()) - - return slots - - def getMaxTime(self, job_data): - """ iterate over Jobs and increment the total allowable time needed to complete the entire group """ - total_time = 0 - for job in [x for x in job_data.jobs.getJobs() if not x.isSkip()]: - total_time += int(job.getMaxTime()) - - return total_time - - def addDirtyFiles(self, job, file_list=[]): - """ append list of files which will be generated by derived scheduler """ - _dirty_files = self.getDirtyFiles(job) - file_list.extend(_dirty_files) - file_list = list(set(file_list)) - job.addMetaData(DIRTY_FILES=file_list) - - def getDirtyFiles(self, job): - """ return list of files not indigenous to the repository which was created by third party schedulers """ - return job.getMetaData().get('DIRTY_FILES', []) - - def cleanAndModifyArgs(self): - """ - Filter out any arguments that will otherwise break the TestHarness when launched _within_ - the third party scheduler (such as --pbs) - """ - # return cached args if we have already produced clean args - if not self.__clean_args: - current_args = list(sys.argv[1:]) - - # Ask derived schedulers for any additional args we should strip from sys.args - bad_args = self.getBadArgs() - bad_keyword_args = self.getBadKeyArgs() - - # 
Split keyword args so we can match/remove them (the key, and its value pair) - key_value_args = [x for x in current_args if '=' in x] - for arg in key_value_args: - current_args.remove(arg) - current_args.extend(arg.split('=')) - - # Note: we are removing cli-args/ignore because we need to re-encapsulate them below - bad_keyword_args.extend(['--spec-file', '-i', '--cli-args', '-j', '-l', '-o', '--output-dir', '--ignore', '--re']) - - # remove the key=value pair argument - for arg in bad_keyword_args: - if arg in current_args: - key = current_args.index(arg) - del current_args[key:key+2] - - # Special: re-encapsulate --cli-args - if self.options.cli_args: - current_args.extend(['--cli-args', '"%s"' % self.options.cli_args]) - if self.options.ignored_caveats: - current_args.extend(['--ignore', '"%s"' % self.options.ignored_caveats]) - if self.options.reg_exp: - current_args.extend(['--re', '"%s"' % self.options.reg_exp]) - - # remove any specified positional arguments - for arg in bad_args: - if arg in current_args: - current_args.remove(arg) - - self.__clean_args = current_args - - return self.__clean_args - - def getRunTestsCommand(self, job, cpus): - """ return the command necessary to launch the TestHarness within the third party scheduler """ - - # Build ['/path/to/run_tests', '-j', '#'] - command = [os.path.join(self.harness.run_tests_dir, 'run_tests'), - '-j', str(cpus)] - - # get current sys.args we are allowed to include when we launch run_tests - args = list(self.cleanAndModifyArgs()) - - # Build [, '--spec-file' ,/path/to/tests', '-o', '/path/to', '--sep-files'] - args.extend(['--spec-file', - os.path.join(job.getTestDir(), - self.options.input_file_name), - '-o', job.getTestDir(), - '--sep-files']) - - # Build [, ] - command.extend(args) - - return command - - def hasQueuingFailed(self, job_data): - """ Determine if the third party scheduler killed the job prematurely """ - return False - - def _isProcessReady(self, job_data): - """ - Return bool on `run_tests --spec_file` submission results being available. Due to the - way the TestHarness writes to this results file (when the TestHarness exits), this file, - when available, means every test contained therein is finished in some form or another. - - If the result file does not exist, determine if it ever will exist. Tests which can fall - into this group, are those which were: skipped, deleted, silent, etc during the initial - launch phase. - """ - # No session file. Return immediately. 
- if not job_data.json_data.get(job_data.job_dir, False): - return False - - is_ready = True - # Job group exists in queue session and was apart of the queueing process - job_meta = job_data.json_data.get(job_data.job_dir, {}) - scheduler = job_data.json_data.get('SCHEDULER', '') - if job_meta: - # result file exists (jobs are finished) - if os.path.exists(os.path.join(job_data.job_dir, self.__job_storage_file)): - pass - - # ask derived scheduler if this job has failed - elif self.hasQueuingFailed(job_data): - for job in job_data.jobs.getJobs(): - job.setStatus(job.error) - is_ready = False - - # result does not yet exist but will in the future - else: - for job in job_data.jobs.getJobs(): - tester = job.getTester() - if tester.isSilent(): - continue - - status, message, caveats = job.previousTesterStatus(self.options, job_data.json_data) - if status == self.__status_system.skip or status == self.__status_system.no_status: - tester.setStatus(status, 'SKIP') - else: - tester.setStatus(status, job_meta[scheduler]['STATUS']) - if caveats: - tester.addCaveats(caveats) - status_message = tester.getStatusMessage() - - # This single job will enter the runner thread pool - if status_message == "LAUNCHING": - tester.setStatus(tester.queued) - - is_ready = False - - # Job group not originally launched - else: - for job in job_data.jobs.getJobs(): - tester = job.getTester() - status, message, caveats = job.previousTesterStatus(self.options, job_data.json_data) - tester.setStatus(status, message) - if caveats: - tester.addCaveats(caveats) - - if tester.isNoStatus(): - tester.setStatus(tester.silent) - is_ready = False - - if not is_ready: - for job in job_data.jobs.getJobs(): - job.setStatus(job.finished) - - return is_ready - - def _isLaunchable(self, job_data): - """ bool if jobs are ready to launch """ - # results data exists (set during scheduler plugin initialization), so do no launch again - if job_data.json_data.get(job_data.job_dir, False): - return False - - return True - - def _prepareJobs(self, job_data): - """ - Prepare jobs for launch. - - Grab an arbitrary job and record any necessary information the third party - queueing systems requires for launch (walltime, ncpus, etc). Set all other - jobs to a finished state. The arbitrary job selected will be the only job - which enters the runner thread pool, and executes the commands neccessary - for job submission. - """ - job_list = job_data.jobs.getJobs() - - # Clear any caveats set (except skips). As they do not apply during job submission - for job in [x for x in job_list if not x.isSkip()]: - job.clearCaveats() - - if job_list: - launchable_jobs = [x for x in job_list if not x.isFinished()] - if launchable_jobs: - executor_job = job_list.pop(job_list.index(launchable_jobs.pop(0))) - scheduler_meta = {job_data.plugin : {'QUEUEING_NCPUS' : self.getCores(job_data), - 'QUEUEING_MAXTIME' : self.getMaxTime(job_data)} - } - self.options.results_storage[executor_job.getTestDir()] = scheduler_meta - - executor_job.setStatus(executor_job.hold) - for job in launchable_jobs: - tester = job.getTester() - tester.setStatus(tester.queued, 'LAUNCHING') - job.setStatus(job.finished) - - def _prevJobGroupFinished(self, jobs): - """ Loop through jobs and return immediately if any one job has a finished status """ - for job in jobs: - # ignore detection of skipped/silent/deleted finished statuses. 
- if job.isSilent() or job.isSkip(): - continue - (key, value) = job.getTestDir(), job.getTestName() - previous_status = self.__status_system.createStatus(self.options.results_storage[key][value]['STATUS']) - if (self.__status_system.isValid(previous_status) - and previous_status not in self.__status_system.getPendingStatuses()): - return True - return False - - def _setJobStatus(self, job_data): - """ - Read the json results file for the finished submitted job group, and match our - job statuses with the results found therein. - """ - job_list = job_data.jobs.getJobs() - if job_list: - testdir_json = os.path.join(job_data.job_dir, self.__job_storage_file) - - with open(testdir_json, 'r') as f: - try: - # Determine if we've already recorded the results for this job group - if self._prevJobGroupFinished(job_list): - results = self.options.results_storage - else: - results = json.load(f) - except ValueError: - print('Unable to parse json file: %s' % (testdir_json)) - sys.exit(1) - - group_results = results[job_data.job_dir] - - # Continue to store previous third-party queueing data - job_meta = self.options.results_storage[job_data.job_dir] - job_list[0].addMetaData(**{job_data.plugin : job_meta[job_data.plugin]}) - job_meta[job_data.plugin]['STATUS'] = 'FINISHED' - - for job in job_list: - # Perhaps the user is filtering this job (--re, --failed-tests, etc) - tester = job.getTester() - job.setStatus(job.finished) - if tester.isSilent() or tester.isSkip(): - continue - elif self.options.failed_tests and tester.isPass(): - tester.setStatus(tester.silent) - continue - - if group_results.get(job.getTestName(), {}): - job_results = group_results[job.getTestName()] - status, message, caveats = job.previousTesterStatus(self.options, results) - tester.setStatus(status, message) - if caveats: - tester.addCaveats(caveats) - - # Recover useful job information from job results - job.setPreviousTime(job_results['TIMING']) - - # Read output file (--sep-files-ok|fail) - if job.getOutputFile() and os.path.exists(job.getOutputFile()): - self.addDirtyFiles(job, [job.getOutputFile()]) - if (self.options.reg_exp - or self.options.failed_tests - or self.options.verbose) and not self.options.quiet: - with open(job.getOutputFile(), 'r') as outfile: - job.setOutput(outfile.read()) - else: - job.setOutput(f'See error in file: {job.getOutputFile()}') - else: - job.setOutput('Output file is not available, or was never generated') - - # This is a newly added test in the spec file, which was not a part of original launch - else: - tester.addCaveats('not originally launched') - tester.setStatus(tester.skip) - - def _setSilentForClean(self, Jobs): - """ silence and set a finished status for all testers """ - job_list = Jobs.getJobs() - for job in job_list: - tester = job.getTester() - tester.setStatus(tester.silent) - job.setStatus(job.finished) - return job_list - - def _cleanupFiles(self, Jobs): - """ perform cleanup operations """ - job_list = self._setSilentForClean(Jobs) - top_job_key = job_list[0].getTestDir() - plugin = self.harness.scheduler.__class__.__name__ - - # Top Job (entire TestDir group) not originally part of what was launched - # (not launched due to: --re, -i --spec-file) - if top_job_key not in self.options.results_storage.keys(): - return - # All jobs ended up being skipped in this group - # (compiler!=gcc, heavy, petsc_version, etc) - elif plugin not in self.options.results_storage[top_job_key].keys(): - return - - # Build file_list with files we should delete - file_list = 
[os.path.join(top_job_key, self.options.results_file)] - job_meta = self.options.results_storage[top_job_key] - scheduler_meta = job_meta[plugin] - file_list.extend(scheduler_meta.get('DIRTY_FILES', [])) - for test_dir_key, meta in job_meta.items(): - if type(meta) == type({}) and 'META_DATA' in meta.keys(): - file_list.extend(meta["META_DATA"].get("DIRTY_FILES", [])) - - # Delete files generated by jobs and the scheduler - for dirty_file in file_list: - # Safty check. Any indigenous file generated by QueueManager should only exist in the tester directory - if os.path.dirname(dirty_file) == top_job_key: - try: - if os.path.isdir(dirty_file): - shutil.rmtree(dirty_file) - else: - os.remove(dirty_file) - except OSError: - pass diff --git a/python/TestHarness/schedulers/RunHPC.py b/python/TestHarness/schedulers/RunHPC.py new file mode 100644 index 000000000000..b8530b36e932 --- /dev/null +++ b/python/TestHarness/schedulers/RunHPC.py @@ -0,0 +1,973 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import urllib.parse +from RunParallel import RunParallel +import threading, os, re, sys, datetime, shlex, socket, threading, time, urllib, contextlib +from enum import Enum +import paramiko +import jinja2 +import statistics +import contextlib +from multiprocessing.pool import ThreadPool +from TestHarness import util + +class HPCJob: + # The valid job states for a HPC job + State = Enum('State', ['waiting', 'held', 'queued', 'running', 'done', 'killed']) + + """ + Structure that represents the cached information about an HPC job + """ + def __init__(self, job): + # The underlying Job + self.job = job + # The ID of the HPC job + self.id = None + # The command that was ran within the job + self.command = None + # The state that this job is in + self.state = self.State.waiting + # The exit code of the command that was ran (if any) + self.exit_code = None + # The number of times the job has been resubmitted + self.num_resubmit = 0 + # Lock for accessing this object + self.lock = threading.Lock() + + def getLock(self): + """ + Gets the lock for this object. + """ + return self.lock + + def get(self, key): + """ + Thread-safe getter for a key + """ + with self.getLock(): + return getattr(self, key) + + def getState(self): + """ + Thread-safe getter for the state + """ + return self.get('state') + + def isKilled(self): + """ + Thread-safe getter for whether or not this was killed + """ + return self.getState() == self.State.killed + + def reset(self): + """ + Resets the job state + + Not thread safe; should be called within a lock + """ + self.id = None + self.command = None + self.state = self.State.waiting + self.exit_code = None + +class RunHPC(RunParallel): + # The types for the pools for calling HPC commands + CallHPCPoolType = Enum('CallHPCPoolType', ['submit', 'queue', 'status', 'kill']) + + """ + Base scheduler for jobs that are ran on HPC. 
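Derived schedulers supply the scheduler-specific pieces. A minimal sketch of the expected overrides (the return values shown are the ones RunPBS uses later in this diff; the class name RunMyScheduler is hypothetical):

    class RunMyScheduler(RunHPC):
        def getHPCSchedulerName(self):      return 'pbs'
        def getHPCSubmissionCommand(self):  return 'qsub'
        def getHPCQueueCommand(self):       return 'qrls'
        def getHPCCancelCommand(self):      return 'qdel -W force'
        def getHPCJobIDVariable(self):      return 'PBS_JOBID'
        def parseHPCSubmissionJobID(self, result):
            return result.strip()  # scheduler-specific parsing of the submission output
        def updateHPCJobs(self, hpc_jobs):
            # poll the scheduler, then call setHPCJobRunning()/setHPCJobDone();
            # return True/False depending on whether the update succeeded
            return True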
+ """ + def __init__(self, harness, params): + super().__init__(harness, params) + + self.params = params + self.options = harness.getOptions() + + # We don't want to report long running jobs here because we will + # manually set jobs as RUNNING as we notice their HPC status change + self.report_long_jobs = False + # We don't want to enforce the timeout here because we don't want to + # check it while the jobs are queued and HPC itself will handle the + # timeout because the job itself will be forcefully killed by HPC + self.enforce_timeout = False + + # Lock for accessing self.hpc_jobs + self.hpc_jobs_lock = threading.Lock() + # How often to poll (in sec) for status updates in getHPCJob() + self.hpc_jobs_update_interval = 5 + # How many HPC jobs to update at a time in updateHPCJobs() + # This needs to be an option because PBS is awful + self.update_hpc_jobs_chunk_size = 50 + # Map of Job ID -> HPCJob + self.hpc_jobs = {} + # The thread that will update the HPCJobs + self.hpc_jobs_updater = None + + # The pool of processes for running HPC scheduler commands + # We have a pool so that we don't overwhelm the login node + # with commands, and have a pool for each interaction type + # so that those commands only compete with commands of the + # other type + self.call_hpc_pool = {} + self.call_hpc_pool[self.CallHPCPoolType.submit] = ThreadPool(processes=5) + if not self.options.hpc_no_hold: # only used with holding jobs + self.call_hpc_pool[self.CallHPCPoolType.queue] = ThreadPool(processes=5) + for val in [self.CallHPCPoolType.status, self.CallHPCPoolType.kill]: + self.call_hpc_pool[val] = ThreadPool(processes=1) + + # The jump hostname for running commands, if any + self.ssh_hosts = self.options.hpc_host + # The SSH key to use for connections + self.ssh_key_filenames = None + # The threaded SSHClient objects, mapped by thread identifier + # Tuple of (paramiko.SSHClient, str) where str is the hostname + self.ssh_clients = None + # The lock for calling commands via SSH, + self.ssh_clients_lock = None + # Setup the jump host if provided + # We allow multitple hosts here to have backups + if self.ssh_hosts: + if isinstance(self.ssh_hosts, str): + self.ssh_hosts = [self.ssh_hosts] + + self.ssh_clients = {} + self.ssh_clients_lock = threading.Lock() + + # Try to find a key to use for each host. 
Paramiko doesn't + # use any non-default keys by default, so we need to search + # like this and apply them manually + self.ssh_key_filenames = {} + for host in self.ssh_hosts: + try: + ssh_config = os.path.expanduser('~/.ssh/config') + config = paramiko.SSHConfig.from_path(ssh_config).lookup(host) + identityfile = config.get('identityfile') + if identityfile is not None and len(identityfile) > 0: + self.ssh_key_filenames[host] = identityfile[-1] + except: + pass + + # Make sure that we can call commands up front, only if we're not re-running + if not self.options.show_last_run: + for val in self.CallHPCPoolType: + if self.options.hpc_no_hold and val == self.CallHPCPoolType.queue: + continue + self.callHPC(val, 'hostname') + + # Pool for submitJob(), so that we can submit jobs to be + # held in the background without blocking + self.submit_job_pool = None if self.options.hpc_no_hold else ThreadPool(processes=10) + + if os.environ.get('APPTAINER_CONTAINER'): + if not self.ssh_hosts: + print('ERROR: --hpc-host must be set when using HPC jobs within apptainer') + sys.exit(1) + if not self.options.hpc_pre_source: + default_pre_source = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_source') + self.options.hpc_pre_source = default_pre_source + print(f'INFO: Setting --hpc-pre-source={default_pre_source}') + else: + if self.options.hpc_apptainer_bindpath: + print('ERROR: --hpc-apptainer-bindpath is unused when not executing with apptainer') + sys.exit(1) + if self.options.hpc_apptainer_no_home: + print('ERROR: --hpc-apptainer-no-home is unused when not executing with apptainer') + sys.exit(1) + + if self.options.hpc_pre_source and not os.path.exists(self.options.hpc_pre_source): + print(f'ERROR: --hpc-pre-source path {self.options.hpc_pre_source} does not exist') + sys.exit(1) + if self.options.hpc and self.options.pedantic_checks: + print('ERROR: --hpc and --pedantic-checks cannot be used simultaneously') + sys.exit(1) + if self.options.hpc and self.options.jobs: + print('ERROR: --hpc and -j|--jobs cannot be used simultaneously') + sys.exit(1) + + # Load the pre-source if it exists + self.source_contents = None + if self.options.hpc_pre_source: + self.source_contents = open(self.options.hpc_pre_source, 'r').read() + + # Load the submission template + template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_template') + self.submission_template = open(template_path, 'r').read() + + class CallHPCException(Exception): + """ + Exception class for providing extra context for HPC submission errors + """ + def __init__(self, description, host, command, result=None): + message = f'{description}' + if host: + message += f' on host "{host}"' + message += f'\nCommand: {command}' + if result: + message += f'\n\nResult:\n{result}' + super().__init__(message) + + def _getSSHClient(self, reconnect=False): + """ + Gets a SSH client owned by a thread. + + This is threaded so that we can operate a few connections at once. 
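For example, _callSSH() below fetches the per-thread client and, if the connection has died, retries once with reconnect=True:

    client, host = self._getSSHClient(reconnect=retry)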
+ """ + process = threading.get_ident() + with self.ssh_clients_lock: + if process not in self.ssh_clients or reconnect: + self.ssh_clients[process] = None + for host in self.ssh_hosts: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + key_filename = self.ssh_key_filenames.get(host) + try: + client.connect(host, key_filename=key_filename) + except Exception as e: + print(f'WARNING: Failed to connect to HPC host {host}: {e}') + continue + self.ssh_clients[process] = (client, host) + break + + client_and_host = self.ssh_clients.get(process) + if client_and_host is None: + raise Exception(f'Failed to connect to SSH host(s) {", ".join(self.ssh_hosts)}') + return client_and_host + + def _callSSH(self, command): + """ + Calls a SSH command. + + Should only be used via apply with the self.call_hpc_pool. + """ + client, host = self._getSSHClient() + + # Here we try twice, in the event that the connection was killed + retry = False + while True: + try: + client, host = self._getSSHClient(reconnect=retry) + _, stdout, stderr = client.exec_command(command) + except Exception as e: + if not retry: + retry = True + continue + raise RunHPC.CallHPCException('Failed to execute remote command', host, command) from e + break + + exit_code = stdout.channel.recv_exit_status() + result = ''.join(stdout.readlines()) + if exit_code != 0: + result += ''.join(stderr.readlines()) + full_command = f"ssh {host} '{command}'" + return exit_code, result.rstrip(), full_command + + def callHPC(self, pool_type, command: str, num_retries: int = 0, retry_time: float = 5): + """ + Wrapper for calling a HPC command (qsub, qstat, etc) that supports + SSH-ing to another host as needed when calling from within apptainer + + Set num_retires to retry the command this many times, waiting + retry_time sec between each retry. The command will only be retried + if self.callHPCShouldRetry() is True for that command. This lets + us retry commands given known failures. + + Requires the "pool" to specify which command pool to use, of the + RunHPC.CallHPCPoolType types. + """ + if not self.ssh_hosts: + raise Exception('HPC not currently supported outside of a container') + + exit_code = None + result = None + full_cmd = None + + for i in range(num_retries + 1): + exit_code, result, full_cmd = self.call_hpc_pool[pool_type].apply(self._callSSH, (command,)) + if exit_code == 0: + break + if self.callHPCShouldRetry(pool_type, result): + time.sleep(retry_time) + else: + break + + return exit_code, result, full_cmd + + def getJobSlots(self, job): + # Jobs only use one slot because they are ran externally + return 1 + + def availableSlots(self, params): + # Support managing 250 HPC jobs concurrently + return 250, False + + @staticmethod + def jobCaveat(hpc_job) -> str: + """ + Gets the caveat associated with the job ID for a HPCJob + """ + job_id = hpc_job.id + assert job_id is not None + return f'job={job_id}' if job_id.isdigit() else job_id + + def resubmitHPCJob(self, hpc_job): + """ + Resumits the given HPCJob. + + The HPCJob must have already been submitted. + + This should be called from within the derived + scheduler to resubmit. + """ + assert hpc_job.state != hpc_job.State.waiting + job = hpc_job.job + job.removeCaveat(self.jobCaveat(hpc_job)) + hpc_job.job.addCaveats('resubmitted') + hpc_job.reset() + hpc_job.num_resubmit += 1 + self.submitJob(job, False, lock=False) + + def submitJob(self, job, hold, lock=True): + """ + Method for submitting an HPC job for the given Job. 
+ + The "hold" flag specifies whether or not to submit + the job in a held state. + + Set lock=False if calling this within a method + whether the HPC job lock is already obtained. + + Returns the resulting HPCJob. + """ + # If we're submitting this Job to be held, but the Job status isn't + # currently held, it means that we've hit job in the submit_job_pool + # that was submitted previously but has already been set to be skipped + # (likely due to a prereq failure) + # NOTE: This _is_ thread safe because StatusSystem is blocking + if hold and not job.isHold(): + return None + + with self.hpc_jobs_lock: + hpc_job = self.hpc_jobs.get(job.getID()) + + # Job hasn't been recorded yet; set up with a waiting state + if hpc_job is None: + assert lock is True + self.hpc_jobs[job.getID()] = HPCJob(job) + hpc_job = self.hpc_jobs.get(job.getID()) + + hpc_job_lock = hpc_job.getLock() if lock else contextlib.nullcontext() + with hpc_job_lock: + # Job has already been submitted + if hpc_job.state != hpc_job.State.waiting: + return hpc_job + + tester = job.getTester() + options = self.options + + # If we have a separate output directory, we might need to create this + # for the files that follow. This won't do anything if it exists and + # it is thread safe + job.createOutputDirectory() + + submission_script = self.getHPCJobSubmissionPath(job) + output_file = self.getHPCJobOutputPath(job) + result_file = self.getHPCJobResultPath(job) + + # Remove these files if they exist + for file in [submission_script, output_file, result_file]: + if os.path.exists(file): + os.remove(file) + + # Add MOOSE's python path for python scripts + moose_python = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../..')) + + # Start building the jinja environment for the submission script + submission_env = {'SCHEDULER_NAME': self.getHPCSchedulerName(), + 'NAME': self.getHPCJobName(job), + 'CWD': tester.getTestDir(), + 'OUTPUT': output_file, + 'RESULT': result_file, + 'SUBMISSION_SCRIPT': submission_script, + 'WALLTIME': str(datetime.timedelta(seconds=tester.getMaxTime())), + 'PROJECT': self.options.hpc_project, + 'TEST_SPEC': tester.getSpecFile(), + 'TEST_NAME': tester.getTestNameShort(), + 'SUBMITTED_HOSTNAME': socket.gethostname(), + 'MOOSE_PYTHONPATH': moose_python, + 'NUM_PROCS': int(tester.getProcs(options)), + 'NUM_THREADS': int(tester.getThreads(options)), + 'ENDING_COMMENT': self.getOutputEndingComment(f'${self.getHPCJobIDVariable()}'), + 'JOB_ID_VARIABLE': self.getHPCJobIDVariable(), + 'PLACE': tester.getHPCPlace(options)} + if hold: + submission_env['HOLD'] = 1 + if self.options.hpc_pre_source: + submission_env['SOURCE_FILE'] = options.hpc_pre_source + if self.source_contents: + submission_env['SOURCE_CONTENTS'] = self.source_contents + + # Get the unescaped command + command = tester.getCommand(options) + + # Parse out the mpi command from the command if we're running in apptainer. + # We do this before any of the other escaping + APPTAINER_CONTAINER = os.environ.get('APPTAINER_CONTAINER') + apptainer_command_prefix = '' + if APPTAINER_CONTAINER: + mpi_command = self.parseMPICommand(command) + if mpi_command: + apptainer_command_prefix = mpi_command + command = command.replace(mpi_command, '') + + # Replace newlines, clean up spaces, and encode the command. We encode the + # command here to be able to pass it to a python script to run later without + # dealing with any substitution or evaluation within a shell. Thus, this is + # akin to the SubprocessRunner also running commands. 
It's a bit complicated, + # but I promise that it's much better than the alternative + command = command.replace('\n', ' ') + command = ' '.join(command.split()) + command_encoded = urllib.parse.quote(command) + + # Script used to decode the command as described above + hpc_run = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'hpc_run.py') + + # Special logic for when we're running with apptainer, in which case + # we need to manipulate the command like such + # Original command: + # New command: apptainer exec /path/to/image '' + if APPTAINER_CONTAINER: + job_command = apptainer_command_prefix + + # The root filesystem path that we're in so that we can be sure to bind + # it into the container, if not already set + if self.options.hpc_apptainer_bindpath: + bindpath = self.options.hpc_apptainer_bindpath + else: + bindpath = '/' + os.path.abspath(tester.getTestDir()).split(os.path.sep)[1] + # The apptainer command that will get sandwiched in the middle + apptainer_command = ['apptainer', 'exec', '-B', bindpath] + if self.options.hpc_apptainer_no_home: + apptainer_command.append('--no-home') + apptainer_command.append(APPTAINER_CONTAINER) + apptainer_command = shlex.join(apptainer_command) + + # Append the apptainer command along with the command to be ran + job_command += f"{apptainer_command} {hpc_run} {command_encoded}" + + # Set that we're using apptainer + submission_env['USING_APPTAINER'] = '1' + # Not in apptainer, so we can just use the escaped command as is + else: + job_command = f'{hpc_run} {command_encoded}' + + submission_env['COMMAND'] = job_command + + # The output files that we're expected to generate so that the + # HPC job can add a terminator for them so that we can verify + # they are complete on the executing host + additional_output = [result_file] + for file in tester.getOutputFiles(options): + additional_output.append(os.path.join(tester.getTestDir(), file)) + # This is a bash array, need to wrap each entry in double quotes + additional_output_wrapped = [] + for entry in additional_output: + additional_output_wrapped.append(f'"{entry}"') + submission_env['ADDITIONAL_OUTPUT_FILES'] = ' '.join(additional_output_wrapped) + + # Let the derived scheduler add additional variables + self.augmentJobSubmission(submission_env) + + # Build the script + jinja_env = jinja2.Environment() + definition_template = jinja_env.from_string(self.submission_template) + jinja_env.trim_blocks = True + jinja_env.lstrip_blocks = True + script = definition_template.render(**submission_env) + + # Write the script + open(submission_script, 'w').write(script) + + # Submission command. 
Here we have a simple bash loop + # that will try to wait for the file if it doesn't exist yet + submission_command = self.getHPCSubmissionCommand() + cmd = [f'cd {tester.getTestDir()}', + f'FILE="{submission_script}"', + 'for i in {1..40}', + 'do if [ -e "$FILE" ]', + f'then {self.getHPCSubmissionCommand()} $FILE', + 'exit $?', + 'else sleep 0.25', + 'fi', + 'done', + 'exit 1'] + cmd = '; '.join(cmd) + + # Do the submission; this is thread safe + exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.submit, cmd, num_retries=5) + + # Start the queued timer if needed + if not hold: + job.timer.start('hpc_queued') + + # Set what we've ran for this job so that we can + # potentially get the context in an error + tester.setCommandRan(full_cmd) + + # Nonzero return code + if exit_code != 0: + raise self.CallHPCException(self, f'{submission_command} failed', full_cmd, result) + + # Set the HPC job state + hpc_job.id = self.parseHPCSubmissionJobID(result) + hpc_job.command = job_command + hpc_job.state = hpc_job.State.held if hold else hpc_job.State.queued + + # Job has been submitted, so set it as queued + # Here we append job_id if the ID is just a number so that it's more + # obvious what it is + job.addCaveats(self.jobCaveat(hpc_job)) + + # Print the job as it's been submitted + job_status = job.hold if hold else job.queued + self.setAndOutputJobStatus(hpc_job.job, job_status, caveats=True) + + return hpc_job + + def queueJob(self, job): + """ + Method for queuing a Job to start. + + Should be called from within the HPCRunner to get a job going. + + If the job is not submitted yet, it will submit it in a + non-held state. If the job is submitted but held, it will + release the job. + """ + # See if the job has been submitted yet in the background + with self.hpc_jobs_lock: + hpc_job = self.hpc_jobs.get(job.getID()) + + # If the updater hasn't been started yet, start it. + # We do this here because it's locked within hpc_jobs_lock + # and it means that we won't start looking for jobs until + # we have at least one job + if not self.hpc_jobs_updater: + self.hpc_jobs_updater = threading.Thread(target=self._updateHPCJobs) + self.hpc_jobs_updater.start() + + # Job has not been submitted yet, so submit it in non-held state + if hpc_job is None: + return self.submitJob(job, False) + + # Job has been submitted but is held, so queue it + with hpc_job.getLock(): + if hpc_job.state == hpc_job.State.held: + if self.options.hpc_no_hold: + raise Exception('Job should not be held with holding disabled') + + cmd = f'{self.getHPCQueueCommand()} {hpc_job.id}' + exit_code, result, full_cmd = self.callHPC(self.CallHPCPoolType.queue, cmd, num_retries=5) + if exit_code != 0: + try: + self.killHPCJob(hpc_job, lock=False) # already locked + except: + pass + raise self.CallHPCException(self, f'{cmd} failed', full_cmd, result) + + # Start the timer now that we've queued it + hpc_job.job.timer.start('hpc_queued') + + self.setHPCJobQueued(hpc_job) + + return hpc_job + + def augmentJobSubmission(self, submission_env): + """ + Entry point for derived schedulers to append to the + submission environment, which is used to populate + the submission jinja template. + """ + return + + def _updateHPCJobs(self): + """ + Function that is called in a separate thread to update the job + status given some interval. + """ + # We want to allow failure to happen once, just not twice in a row. 
+ # This is a good sanity check for when occasionally the login + # node doesn't respod as expected + update_jobs_failed = False + + try: + while True: + # Here we want to store our own list to these objects + # so that we don't hold onto the lock while we work + # on each job individually + with self.hpc_jobs_lock: + hpc_jobs = [x for x in self.hpc_jobs.values()] + + # Get all of the HPC jobs that are currently active + active_states = [HPCJob.State.queued, HPCJob.State.running] + active_hpc_jobs = [x for x in hpc_jobs if x.getState() in active_states] + + # Helper for splitting a list into chunks. We won't update + # everything together because PBS is particularly bad + # at processing the status for a ton of jobs at once... + def in_chunks(l): + N = self.update_hpc_jobs_chunk_size + for i in range(0, len(l), N): + yield l[i:i + N] + + # Whether or not all of the updates suceeded + success = True + + # Process a subset of jobs at a time + for chunked_hpc_jobs in in_chunks(active_hpc_jobs): + # Returns whether or not it failed + if not self.updateHPCJobs(chunked_hpc_jobs): + success = False + + # At least one of the updates failed; allow this to + # happen only once + if not success: + if update_jobs_failed: + self.triggerErrorState() + print('ERROR: Failed to get HPC job status') + return + update_jobs_failed = True + else: + update_jobs_failed = False + + # Update on the interval requested, but also make sure + # that we're still running + poll_time = 0.1 + for i in range(int(self.hpc_jobs_update_interval / poll_time)): + if not self.isRunning(): + return + time.sleep(poll_time) + except: + self.triggerErrorState() + raise + + def updateHPCJobs(self, hpc_jobs): + """ + Updates the underlying jobs. + + Should be overridden and should return True or False + depending on whether or not the update succeeded. + + Should use setHPCJobRunning() and setHPCJobDone() + to trigger changes in HPC job state. + """ + raise Exception('Unimplemented updateHPCJobs()') + + def setHPCJobRunning(self, hpc_job, start_time): + """ + Sets the given HPC job as running. + + Should be called within a lock for the given HPCJob. + + This should be called within the overridden updateHPCJobs() to + set a HPCJob as running. + """ + job = hpc_job.job + timer = job.timer + + # This is currently thread safe because we only ever change + # it within updateJobs(), which is only ever executed serially + # within the thread the calls _updateHPCJobs() + hpc_job.state = hpc_job.State.running + + # The job is no longer queued as of when it started + if timer.hasTime('hpc_queued'): + queued_start_time = timer.startTime('hpc_queued') + # This can happen on slurm in < 1s, which could give us negatives + if start_time < queued_start_time: + timer.stop('hpc_queued', queued_start_time) + else: + timer.stop('hpc_queued', start_time) + # The runner job (actual walltime for the exec) as of when it started + timer.start('runner_run', start_time) + + # Print out that the job is now running + self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.running, caveats=True) + + def setHPCJobQueued(self, hpc_job): + """ + Sets the given HPC job as being queued. + + Should be called within a lock for the given HPCJob. + + This can be used when the HPC scheduler re-schedules the job. + + This should be called within the overridden updateHPCJobs(). 
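For example, the RunPBS override later in this diff re-queues jobs that PBS reports it will run again:

    if name in ['JOB_EXEC_HOOK_RERUN', 'JOB_EXEC_RETRY']:
        self.setHPCJobQueued(hpc_job)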
+ """ + # Guard against setting this as requeued multiple times + if hpc_job.state == hpc_job.State.queued: + return + hpc_job.state = hpc_job.State.queued + + # Print out that the job is queued again + self.setAndOutputJobStatus(hpc_job.job, hpc_job.job.queued, caveats=True) + + def setHPCJobDone(self, hpc_job, exit_code, end_time): + """ + Sets the given HPC job as done. + + This should be called within the overridden updateHPCJobs(), + within a thread lock for that HPCJob. + """ + job = hpc_job.job + + hpc_job.state = hpc_job.State.done + hpc_job.exit_code = exit_code + + # The runner job (actual walltime for the exec) ends when it stopped + if job.timer.hasTime('runner_run'): + job.timer.stop('runner_run', end_time) + + # We've actually ran something now that didn't fail, so update + # the command to what was ran there + if not job.isError(): + job.getTester().setCommandRan(hpc_job.command) + + def buildRunner(self, job, options): + from TestHarness.runners.HPCRunner import HPCRunner + return HPCRunner(job, options, self) + + def augmentJobs(self, jobs): + super().augmentJobs(jobs) + + # Augment only jobs that are to be ran + for job in jobs: + if job.isHold(): + # If a job has its default time, double it. We grant a + # little more time to small jobs on HPC due to slower IO, etc + tester = job.getTester() + max_time = tester.getMaxTime() + if max_time == tester.getDefaultMaxTime(): + tester.setMaxTime(max_time * 2) + + # Add the Job to the pool to be submitted as a job in + # a held state. We do this as early as possible so that + # we can get a better priority in the HPC queue. This + # is an asynchronous call so it will happen later when + # available. If the Job actually runs before we have + # a chance to get to this in the pool, when it finally + # executes in the pool, it will do nothing because the + # HPCJob will already exist. + if not self.options.hpc_no_hold and not self.options.dry_run: + self.submit_job_pool.apply_async(self.submitJob, (job, True,)) + + def killHPCJob(self, hpc_job, lock=True): + """ + Kills the given HPCJob if it is in a state to be killed. + """ + with hpc_job.getLock() if lock else contextlib.suppress(): + if hpc_job.state in [hpc_job.State.killed, hpc_job.State.done]: + return + job_id = hpc_job.id + hpc_job.state = hpc_job.State.killed + + # Don't care about whether or not this failed + self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {job_id}') + + def killHPCJobs(self, functor): + """ + Kills the HPC jobs the meet the criteria of the functor. + + The functor should take a single object, the HPCJob, and + should return a bool stating whether or not to kill that job. + """ + job_ids = [] + with self.hpc_jobs_lock: + for hpc_job in self.hpc_jobs.values(): + with hpc_job.getLock(): + if functor(hpc_job): + job_ids.append(hpc_job.id) + + if job_ids: + self.callHPC(self.CallHPCPoolType.kill, f'{self.getHPCCancelCommand()} {" ".join(job_ids)}') + + return len(job_ids) + + def killRemaining(self, keyboard=False): + """Kills all currently running HPC jobs""" + functor = lambda hpc_job: hpc_job.state not in [hpc_job.State.killed, hpc_job.State.done] + killed_jobs = self.killHPCJobs(functor) + if keyboard and killed_jobs: + print(f'\nAttempted to kill remaining {killed_jobs} HPC jobs...') + super().killRemaining(keyboard) + + def getHPCSchedulerName(self): + """ + Returns the name of the HPC scheduler in a simple shorthand. + + Used to produce files with a prefix of the scheduler type, i.e., + pbs_something or slurm_something. 
+
+        Should be overridden.
+        """
+        raise Exception('Unimplemented getHPCSchedulerName()')
+
+    def getHPCSubmissionCommand(self):
+        """
+        Returns the command used for submitting jobs.
+
+        Should be overridden.
+        """
+        raise Exception('Unimplemented getHPCSubmissionCommand()')
+
+    def getHPCQueueCommand(self):
+        """
+        Returns the command used for releasing (queueing) a held job.
+
+        Should be overridden.
+        """
+        raise Exception('Unimplemented getHPCQueueCommand()')
+
+    def getHPCCancelCommand(self):
+        """
+        Returns the command used for cancelling jobs.
+
+        Should be overridden.
+        """
+        raise Exception('Unimplemented getHPCCancelCommand()')
+
+    def getHPCJobIDVariable(self):
+        """
+        Returns the environment variable name that contains the job ID
+        when within a job (i.e., on a compute node).
+
+        Should be overridden.
+        """
+        raise Exception('Unimplemented getHPCJobIDVariable()')
+
+    def parseHPCSubmissionJobID(self, result):
+        """
+        Returns the job ID from the result of the submission command
+        (from qsub or sbatch).
+
+        Should be overridden.
+        """
+        raise Exception('Unimplemented parseHPCSubmissionJobID()')
+
+    @staticmethod
+    def getHPCJobName(job) -> str:
+        """Gets the name of the HPC job given a Job
+
+        PBS doesn't like ":" or "/", hence changing them to "."
+        """
+        return job.getTestName().replace(':', '.').replace('/', '.')
+
+    def getHPCJobOutputPath(self, job):
+        """Gets the absolute path for stdout/stderr for an HPC job"""
+        return job.getOutputPathPrefix() + '.hpc_out.txt'
+
+    def getHPCJobResultPath(self, job):
+        """Gets the absolute path for the result (exit code, walltime) for an HPC job"""
+        return job.getOutputPathPrefix() + '.hpc_result'
+
+    def getHPCJobSubmissionPath(self, job):
+        """Gets the absolute path for the qsub script for an HPC job"""
+        return job.getOutputPathPrefix() + '.hpc_submit'
+
+    @staticmethod
+    def getOutputEndingComment(job_id) -> str:
+        """
+        Get the ending comment that is applied to all output files
+        that are read in order to verify that the files are fully
+        synced when reading during postprocessing.
+        """
+        return f'TESTHARNESS RUNHPC FILE TERMINATOR FOR {job_id}'
+
+    @staticmethod
+    def parseMPICommand(command) -> str:
+        """
+        Helper that splits out the mpi command from a given command, if any
+        """
+        find_mpi = re.search(r'^(\s+)?(mpiexec|mpirun)(\s+-(n|np)\s+\d+)?(\s+)?', command)
+        if find_mpi is not None:
+            return find_mpi.group(0)
+        return None
+
+    @staticmethod
+    def setHPCJobError(hpc_job, message, output=None):
+        """
+        Helper for setting an error within an HPC job.
+
+        Should be used within the derived classes' updateHPCJobs().
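For example (mirroring the RunPBS override later in this diff):

    self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal')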
+ """ + job = hpc_job.job + job.setStatus(job.error, message) + if output: + job.appendOutput(util.outputHeader(f'Job {hpc_job.id} {output}')) + + def waitFinish(self): + super().waitFinish() + + # Kill the remaining jobs that are held, which would exist if things + # fail and jobs that we pre-submitted were skipped due to a failed + # dependency above them + functor = lambda hpc_job: hpc_job.state == hpc_job.State.held + self.killHPCJobs(functor) + + def appendStats(self): + timer_keys = ['hpc_queued', 'hpc_wait_output'] + times = {} + for key in timer_keys: + times[key] = [] + num_resubmit = 0 + + for hpc_job in self.hpc_jobs.values(): + timer = hpc_job.job.timer + num_resubmit += hpc_job.num_resubmit + for key in timer_keys: + if timer.hasTotalTime(key): + times[key].append(timer.totalTime(key)) + + averages = {} + for key, values in times.items(): + averages[key] = statistics.mean(values) if values else 0 + + stats = super().appendStats() + stats.update({'hpc_time_queue_average': averages['hpc_queued'], + 'hpc_time_wait_output_average': averages['hpc_wait_output'], + 'hpc_num_resubmitted': num_resubmit}) + return stats + + def appendResultFooter(self, stats): + result = f'Average queue time {stats["hpc_time_queue_average"]:.1f} seconds, ' + result += f'average output wait time {stats["hpc_time_wait_output_average"]:.1f} seconds, ' + result += f'{stats["hpc_num_resubmitted"]} jobs resubmitted.' + return result + + def appendResultFileHeader(self): + entry = {'scheduler': self.options.hpc, + 'pre_source_file': self.options.hpc_pre_source, + 'pre_source': self.source_contents, + 'hosts': self.options.hpc_host if isinstance(self.options.hpc_host, list) else [self.options.hpc_host]} + return {'hpc': entry} + + def appendResultFileJob(self, job): + hpc_job = self.hpc_jobs.get(job.getID()) + if not hpc_job: + return {'hpc': None} + entry = {'id': hpc_job.id, + 'submission_script': self.getHPCJobSubmissionPath(job)} + return {'hpc': entry} + + def callHPCShouldRetry(self, pool_type, result: str): + """ + Entry point for a derived scheduler class to tell us if we can + retry a command given a failure with a certain result. 
+ """ + return False diff --git a/python/TestHarness/schedulers/RunPBS.py b/python/TestHarness/schedulers/RunPBS.py index 1aae116fc2c6..06e09ae3db9f 100644 --- a/python/TestHarness/schedulers/RunPBS.py +++ b/python/TestHarness/schedulers/RunPBS.py @@ -7,214 +7,163 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, re, json -from QueueManager import QueueManager -from TestHarness import util # to execute qsub -import math # to compute node requirement -from PBScodes import * +import re, json +import datetime +from RunHPC import RunHPC +from PBScodes import PBS_User_EXITCODES +from TestHarness import util ## This Class is responsible for maintaining an interface to the PBS scheduling syntax -class RunPBS(QueueManager): - @staticmethod - def validParams(): - params = QueueManager.validParams() - params.addParam('queue_template', os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pbs_template'), "Location of the PBS template") - return params - - def __init__(self, harness, params): - QueueManager.__init__(self, harness, params) - self.params = params - self.harness = harness - self.options = self.harness.getOptions() - - def getBadKeyArgs(self): - """ arguments we need to remove from sys.argv """ - return ['--pbs'] - - def _readJobOutput(self, output_file, N=5): - """ return last few lines in output_file for job group """ - output = [] - if os.path.exists(output_file): - with open(output_file, 'r') as outfile: - for line in (outfile.readlines() [-N:]): - output.append(line) - output.append(f'Last {N} lines read. Full output file available at:\n{output_file}') - return '\n'.join(output) - - def hasQueuingFailed(self, job_data): - """ Determine if PBS killed the job prematurely """ - queue_plugin = self.__class__.__name__ - jobs = job_data.jobs.getJobs() - meta_data = job_data.json_data.get(jobs[0].getTestDir()) - launch_id = meta_data.get(queue_plugin, {}).get('ID', '').split('.')[0] - output_file = os.path.join(jobs[0].getTestDir(), 'qsub.output') - - # Job was never originally launched - if not meta_data.get(queue_plugin, False) or not launch_id: +class RunPBS(RunHPC): + """ + Scheduler class for the PBS HPC scheduler. + """ + def augmentJobSubmission(self, submission_env): + if self.options.hpc_queue: + submission_env['QUEUE'] = self.options.hpc_queue + + def updateHPCJobs(self, hpc_jobs): + # Poll for all of the jobs within a single call + cmd = ['qstat', '-xf', '-F', 'json'] + [x.id for x in hpc_jobs] + exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) + if exit_code != 0: return False - # Job ran to completion - elif os.path.exists(os.path.join(jobs[0].getTestDir(), '.previous_test_results.json')): - return False - - ### Job has some other status ### - - # Check qstat for current status - qstat_command_result = util.runCommand(f'qstat -xf -F json {launch_id}') - - # Catch json parsing errors - try: - json_out = json.loads(qstat_command_result) - pbs_server = json_out['pbs_server'] - job_meta = json_out['Jobs'][f'{launch_id}.{pbs_server}'] - - # JobID no longer exists (stale after 1 week) - except json.decoder.JSONDecodeError: - # Job did not run to completion (no .previous_test_results.json file exists) - if os.path.exists(output_file): - qstat_command_result = (f'ERROR: {self._readJobOutput(output_file)}' - '\n\nMore information available in\n' - f' {output_file}\n') - - # Failed parse, and no output file. 
Perhaps the PBS job was canceled, deleted, etc
-            else:
-                qstat_command_result = ('ERROR: TestHarness encountered an error while'
-                                        f'determining what to make of PBS JobID {launch_id}:\n'
-                                        f'{qstat_command_result}')
-
-        # Handle a qstat execution failure
-        if qstat_command_result.find('ERROR') != -1:
-            for job in job_data.jobs.getJobs():
-                job.setOutput(f'ERROR invoking `qstat`:\n{qstat_command_result}')
-                job.setStatus(job.error, 'QSTAT')
-            return True
-
-        # Use qstat json output to examine current status
-        job_result = job_meta.get('Exit_status', False)
-
-        # Set the status as seen by qstat
-        meta_data[self.__class__.__name__]['STATUS'] = PBS_STATUSES[job_meta['job_state']]
-
-        # Woops. This job was killed by PBS for some reason
-        if job_result and str(job_result) in PBS_User_EXITCODES.keys():
-            output = f'{self._readJobOutput(output_file)}\n{PBS_User_EXITCODES[str(job_result)]}'
-            for job in jobs:
-                job.setOutput(output)
-                job.addCaveats(f'PBS ERROR: {job_result}')
-            return True
-
-        # Capture TestHarness exceptions
-        elif job_result and job_result != "0":
-            if os.path.exists(output_file):
-                with open(output_file, 'r') as f:
-                    output_string = util.readOutput(f, None, jobs[0].getTester())
-                jobs[0].setOutput(output_string)
-            # Add a caveat to each job, explaining that one of the jobs caused a TestHarness exception
-            for job in jobs:
-                job.addCaveats('TESTHARNESS EXCEPTION')
-            return True
-
+        # Parse the status from the jobs
+        json_result = json.loads(result)
+        job_results = json_result['Jobs']
+
+        for hpc_job in hpc_jobs:
+            # This job's result from the qstat command
+            job_result = job_results[hpc_job.id]
+            exit_code = job_result.get('Exit_status')
+            if exit_code is not None:
+                exit_code = int(exit_code)
+            state = job_result.get('job_state')
+            obittime = job_result.get('obittime')
+
+            with hpc_job.getLock():
+                job = hpc_job.job
+
+                # Helper for parsing timings
+                def parse_time(name):
+                    time_format = '%a %b %d %H:%M:%S %Y'
+                    entry = job_result.get(name)
+                    if not entry:
+                        return None
+
+                    try:
+                        return datetime.datetime.strptime(entry, time_format).timestamp()
+                    except:
+                        self.setHPCJobError(hpc_job, 'FAILED TO PARSE TIMING',
+                                            f'Failed to parse time "{entry}" from entry "{name}"')
+                        return None
+
+                # Job is queued and it has switched to running
+                if hpc_job.state == hpc_job.State.queued:
+                    start_time = parse_time('stime')
+                    if start_time:
+                        self.setHPCJobRunning(hpc_job, start_time)
+
+                # The job is held, so we're going to consider it a failure and
+                # will also try to cancel it so that it doesn't hang around
+                if state == 'H' and (job_result.get('Hold_Types') != 'u' or self.options.hpc_no_hold):
+                    self.setHPCJobError(hpc_job, 'PBS JOB HELD', 'was held; killed job')
+                    exit_code = 1
+                    try:
+                        self.killHPCJob(hpc_job, lock=False) # no lock; we're already in one
+                    except:
+                        pass
+
+                # Job finished before it started, so something killed it
+                if state == 'F' and exit_code is None:
+                    self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed')
+                    exit_code = 1
+
+                # If we have a finished time or an error, we're done
+                if exit_code is not None:
+                    if exit_code < 0:
+                        name, reason = PBS_User_EXITCODES.get(exit_code, ('TERMINATED', 'Unknown reason'))
+                        # Job failed to start outside of our submission script, so
+                        # try it again a few times.
This was implemented due to a + # common issue on lemhi + if name == 'JOB_EXEC_FAIL2' and hpc_job.num_resubmit <= 5: + self.resubmitHPCJob(hpc_job) + continue + # Job timed out; give this a special timeout status because + # it is then marked as recoverable (could try running again) + if name == 'JOB_EXEC_KILL_WALLTIME': + job.setStatus(job.timeout, 'PBS JOB TIMEOUT') + # Special status where the job failed to start due to a PBS + # issue and will be started again, so there's nothing to do + elif name in ['JOB_EXEC_HOOK_RERUN', 'JOB_EXEC_RETRY']: + self.setHPCJobQueued(hpc_job) + continue + # Everything else should be an error + else: + self.setHPCJobError(hpc_job, f'PBS ERROR: {name}', f'was terminated with reason: {reason}') + # Job was killed with a signal + elif exit_code >= 128: + self.setHPCJobError(hpc_job, 'PBS JOB KILLED', 'was killed by a signal') + + # Parse end time if possible. PBS is all over the place on this one. Sometimes + # walltime is available, sometimes it isn't. We also have obittime, but that + # time seems to be longer than the actual run. + end_time = None + # First try to get it from the walltime (sometimes this is 0...). We'll fake + # this a bit and just add the walltime to the start time + stime = parse_time('stime') + resources_used = job_result.get('resources_used') + if stime and resources_used: + walltime = resources_used.get('walltime') + if walltime: + search = re.search(r'^(\d+):(\d{2}):(\d{2})$', walltime) + if search: + walltime_sec = datetime.timedelta(hours=int(search.group(1)), + minutes=int(search.group(2)), + seconds=int(search.group(3))).total_seconds() + if walltime_sec != 0: + end_time = stime + walltime_sec + else: + self.setHPCJobError(hpc_job, 'WALLTIME PARSE ERROR', + f'Failed to parse walltime from "{walltime}"') + # If we don't have it yet, use the obit time + if not end_time: + obittime = parse_time('obittime') + if obittime: + end_time = obittime + + self.setHPCJobDone(hpc_job, exit_code, end_time) + + # Success + return True + + def getHPCSchedulerName(self): + return 'pbs' + + def getHPCSubmissionCommand(self): + return 'qsub' + + def getHPCQueueCommand(self): + return 'qrls' + + def getHPCCancelCommand(self): + return 'qdel -W force' + + def getHPCJobIDVariable(self): + return 'PBS_JOBID' + + def parseHPCSubmissionJobID(self, result): + search = re.search('^[0-9]+.[a-zA-Z0-9_-]+$', result) + if not search: + raise Exception(f'qsub has unexpected ID {result}') + return result + + def callHPCShouldRetry(self, pool_type, result: str): + # If we're submitting/releasing/getting a status and cannot connect + # to the scheduler, we can retry + if pool_type in [self.CallHPCPoolType.submit, + self.CallHPCPoolType.queue, + self.CallHPCPoolType.status]: + return 'pbs_iff: cannot connect to host' in result return False - - def _augmentTemplate(self, job): - """ populate qsub script template with paramaters """ - job_data = self.options.results_storage.get(job.getTestDir(), {}) - queue_meta = job_data.get(self.__class__.__name__, { self.__class__.__name__: {} }) - - template = {} - - # Launch script location - template['launch_script'] = os.path.join(job.getTestDir(), os.path.basename(job.getTestNameShort()) + '.qsub') - - # NCPUS - template['mpi_procs'] = queue_meta.get('QUEUEING_NCPUS', 1) - - # Compute node requirement - if self.options.pbs_node_cpus and template['mpi_procs'] > self.options.pbs_node_cpus: - nodes = template['mpi_procs']/self.options.pbs_node_cpus - template['mpi_procs'] = self.options.pbs_node_cpus - else: - nodes = 1 - 
template['nodes'] = math.ceil(nodes) - - # Convert MAX_TIME to hours:minutes for walltime use - max_time = queue_meta.get('QUEUEING_MAXTIME', 1) - hours = int(int(max_time) / 3600) - minutes = int(int(max_time) / 60) % 60 - template['walltime'] = '{0:02d}'.format(hours) + ':' + '{0:02d}'.format(minutes) + ':00' - - # Job Name - template['job_name'] = os.path.basename(job.getTestNameShort()) - - # PBS Project group - template['pbs_project'] = '#PBS -P %s' % (self.options.queue_project) - - # PBS Queue - if self.options.queue_queue: - template['pbs_queue'] = '#PBS -q %s' % (self.options.queue_queue) - else: - template['pbs_queue'] = '' - - # Apply source command - if self.options.queue_source_command and os.path.exists(self.options.queue_source_command): - template['pre_command'] = 'source %s || exit 1' % (os.path.abspath(self.options.queue_source_command)) - else: - template['pre_command'] = '' - - # Redirect stdout to this location - template['output'] = os.path.join(job.getTestDir(), 'qsub.output') - - # Root directory - template['working_dir'] = self.harness.base_dir - - # Command - template['command'] = ' '.join(self.getRunTestsCommand(job, template['mpi_procs'])) - - return template - - def run(self, job): - """ execute qsub and return the launch id """ - tester = job.getTester() - if self.options.dry_run: - tester.setStatus(tester.success, 'DRY_RUN') - return - - template = self._augmentTemplate(job) - job_meta = self.options.results_storage.get(job.getTestDir(), { job.getTestDir() : {} }) - self.createQueueScript(job, template) - command = ' '.join(['qsub', template['launch_script']]) - launch_results = util.runCommand(command, job.getTestDir()) - - # List of files we need to clean up when we are done - dirty_files = [template['launch_script'], - template['output'], - os.path.join(job.getTestDir(), self.harness.results_file)] - - if launch_results.find('ERROR') != -1: - # The executor job failed (so fail all jobs in this group) - job_dag = job.getDAG() - - for other_job in [x for x in job_dag.topological_sort() if x != job]: - other_job.clearCaveats() - other_tester = other_job.getTester() - other_tester.setStatus(other_tester.fail, 'launch failure') - - # This is _only_ to make the failed message more useful - tester.specs['command'] = command - tester.setStatus(tester.fail, 'QSUB Group Failure') - job.setOutput(launch_results) - - else: - # While RunPBS believes this was a successful launch, perhaps this system's PBS system - # failed to launch for some other strange reason, and didn't error (above .find(ERROR) - # routine). 
In which case, it helps to set some 'past tense' grammar as our result - # in our '--pbs some_name' json file - job_meta[self.__class__.__name__].update({'ID' : launch_results, - 'QSUB_COMMAND' : command, - 'NCPUS' : template['mpi_procs'], - 'WALLTIME' : template['walltime'], - 'QSUB_OUTPUT' : template['output'], - 'STATUS' : 'PREVIOUSLY LAUNCHED', - 'DIRTY_FILES' : dirty_files}) - - tester.setStatus(tester.queued, 'LAUNCHING') diff --git a/python/TestHarness/schedulers/RunParallel.py b/python/TestHarness/schedulers/RunParallel.py index 98e4d062eef9..f72fc6051d21 100644 --- a/python/TestHarness/schedulers/RunParallel.py +++ b/python/TestHarness/schedulers/RunParallel.py @@ -10,8 +10,9 @@ import traceback from TestHarness.schedulers.Scheduler import Scheduler -from TestHarness.StatusSystem import StatusSystem from TestHarness import util +from TestHarness.runners.SubprocessRunner import Runner, SubprocessRunner +from TestHarness.testers.Tester import Tester class RunParallel(Scheduler): """ @@ -28,6 +29,9 @@ def __init__(self, harness, params): def run(self, job): """ Run a tester command """ + # Build and set the runner that will actually run the commands + # This is abstracted away so we can support local runners and PBS/slurm runners + job.setRunner(self.buildRunner(job, self.options)) tester = job.getTester() @@ -35,63 +39,32 @@ def run(self, job): if self.options.dry_run: self.setSuccessfulMessage(tester) return + # Load results from a previous run elif self.options.show_last_run: - job_results = self.options.results_storage[job.getTestDir()][job.getTestName()] - status, message, caveats = job.previousTesterStatus(self.options, self.options.results_storage) - tester.setStatus(status, message) - if caveats: - tester.addCaveats(caveats) - job.setPreviousTime(job_results['TIMING']) - job.setOutput(job_results['OUTPUT']) + job.loadPreviousResults() return - output = '' - # Anything that throws while running or processing a job should be caught # and the job should fail try: # Launch and wait for the command to finish job.run() - # Was this job already considered finished? (Timeout, Crash, etc) - if job.isFinished(): - tester.cleanup() - return - - # Allow derived proccessResults to process the output and set a failing status (if it failed) - job_output = job.getOutput() - output = tester.processResults(tester.getMooseDir(), self.options, job_output) - - # If the tester requested to be skipped at the last minute, report that. 
- if tester.isSkip(): - output += '\n' + "#"*80 + '\nTester skipped, reason: ' + tester.getStatusMessage() + '\n' - elif tester.isFail(): - output += '\n' + "#"*80 + '\nTester failed, reason: ' + tester.getStatusMessage() + '\n' - # If the tester has not yet failed, append additional information to output - else: - # Read the output either from the temporary file or redirected files - if tester.hasRedirectedOutput(self.options): - redirected_output = util.getOutputFromFiles(tester, self.options) - output += redirected_output - - # If we asked for redirected output but none was found, we'll call that a failure - if redirected_output == '': - tester.setStatus(tester.fail, 'FILE TIMEOUT') - output += '\n' + "#"*80 + '\nTester failed, reason: ' + tester.getStatusMessage() + '\n' - + # Set the successful message + if not tester.isSkip() and not job.isFail(): self.setSuccessfulMessage(tester) - except Exception as e: - output += 'Python exception encountered:\n\n' + traceback.format_exc() - tester.setStatus(StatusSystem().error, 'TESTER EXCEPTION') - - # Clean up now that we're done - tester.cleanup() - - if job.getOutputFile(): - job.addMetaData(DIRTY_FILES=[job.getOutputFile()]) - - # Set testers output with modifications made above so it prints the way we want it - job.setOutput(output) + except: + trace = traceback.format_exc() + job.appendOutput(util.outputHeader('Python exception encountered in Job') + trace) + job.setStatus(job.error, 'JOB EXCEPTION') + + def buildRunner(self, job, options) -> Runner: + """Builds the runner for a given tester + + This exists as a method so that derived schedulers can change how they + run commands (i.e., for PBS and slurm) + """ + return SubprocessRunner(job, options) def setSuccessfulMessage(self, tester): """ properly set a finished successful message for tester """ diff --git a/python/TestHarness/schedulers/RunSlurm.py b/python/TestHarness/schedulers/RunSlurm.py new file mode 100644 index 000000000000..4acd0e65c80c --- /dev/null +++ b/python/TestHarness/schedulers/RunSlurm.py @@ -0,0 +1,119 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import re +from datetime import datetime +from RunHPC import RunHPC + +## This Class is responsible for maintaining an interface to the slurm scheduling syntax +class RunSlurm(RunHPC): + """ + Scheduler class for the slurm HPC scheduler. 
+ """ + def __init__(self, harness, params): + super().__init__(harness, params) + + # Slurm is significantly better at job status, so we can + # update all at the same time + self.update_hpc_jobs_chunk_size = 1000 + + def updateHPCJobs(self, hpc_jobs): + # Poll for all of the jobs within a single call + active_job_ids = ','.join([x.id for x in hpc_jobs]) + cmd = ['sacct', '-j', active_job_ids, '--parsable2', '--noheader', + '-o', 'jobid,exitcode,state,reason,start,end'] + exit_code, result, _ = self.callHPC(self.CallHPCPoolType.status, ' '.join(cmd)) + if exit_code != 0: + return False + + # Parse the status from the jobs + statuses = {} + for status in result.splitlines(): + # jobid,exitcode,state,reason are split by | + status_split = status.split('|') + # Slurm has sub jobs under each job, and we only care about the top-level job + id = status_split[0] + if not id.isdigit(): + continue + # exitcode is :, where the first value is the + # exit code of the process, the second is a slurm internal code + statuses[id] = {'exitcode': int(status_split[1].split(':')[0]), + 'state': status_split[2], + 'reason': status_split[3], + 'start': status_split[4], + 'end': status_split[5]} + + # Update the jobs that we can + for hpc_job in hpc_jobs: + # Helper for parsing a time + def parse_time(time): + if time: + return datetime.strptime(time, '%Y-%m-%dT%H:%M:%S').timestamp() + return None + + # Slurm jobs are sometimes not immediately available + status = statuses.get(hpc_job.id) + if status is None: + continue + + # The slurm job state; see slurm.schedmd.com/squeue.html#lbAG + state = status['state'] + + with hpc_job.getLock(): + # Job wasn't running and it's no longer pending, so it + # is running or has at least ran + if state != 'PENDING' and hpc_job.state != hpc_job.State.running: + start_time = parse_time(status['start']) + self.setHPCJobRunning(hpc_job, start_time) + + # Job was running and isn't running anymore, so it's done + if hpc_job.state == hpc_job.State.running and state not in ['RUNNING', 'COMPLETING']: + exit_code = int(status['exitcode']) + if state == 'FAILED' and exit_code == 0: + raise Exception(f'Job {hpc_job.id} has unexpected exit code {exit_code} with FAILED state') + + job = hpc_job.job + + # Job has timed out; setting a timeout status means that this + # state is recoverable + if state == 'TIMEOUT': + job.setStatus(job.timeout, 'SLURM JOB TIMEOUT') + # If a job COMPLETED, it's done with exit code 0 so everything + # went well. If it FAILED, it finished but returned with a + # non-zero exit code, which will be handled by the Tester. 
+ elif state not in ['FAILED', 'COMPLETED']: + self.setHPCJobError(hpc_job, f'SLURM ERROR: {state}', f'has state "{state}"') + + end_time = parse_time(status['end']) + self.setHPCJobDone(hpc_job, exit_code, end_time) + + # Success + return True + + def getHPCSchedulerName(self): + return 'slurm' + + def getHPCSubmissionCommand(self): + return 'sbatch' + + def getHPCQueueCommand(self): + return 'scontrol release' + + def getHPCCancelCommand(self): + return 'scancel' + + def getHPCJobIDVariable(self): + return 'SLURM_JOB_ID' + + def parseHPCSubmissionJobID(self, result): + search = re.search('^Submitted batch job ([0-9]+)$', result) + if not search: + raise Exception(f'Failed to parse job ID from "{result}"') + return str(search.group(1)) + diff --git a/python/TestHarness/schedulers/Scheduler.py b/python/TestHarness/schedulers/Scheduler.py index 129ea518edeb..b34832c430a1 100644 --- a/python/TestHarness/schedulers/Scheduler.py +++ b/python/TestHarness/schedulers/Scheduler.py @@ -15,6 +15,7 @@ from timeit import default_timer as clock from multiprocessing.pool import ThreadPool import threading # for thread locking and thread timers +import pyhit class SchedulerError(Exception): pass @@ -59,12 +60,7 @@ def __init__(self, harness, params): # The Scheduler class can be initialized with no "max_processes" argument and it'll default # to a soft limit. If however a max_processes is passed we'll treat it as a hard limit. # The difference is whether or not we allow single jobs to exceed the number of slots. - if params['max_processes'] == None: - self.available_slots = 1 - self.soft_limit = True - else: - self.available_slots = params['max_processes'] # hard limit - self.soft_limit = False + self.available_slots, self.soft_limit = self.availableSlots(params) self.average_load = params['average_load'] @@ -94,12 +90,18 @@ def __init__(self, harness, params): # List of lists containing all job objects entering the run_pool self.__dag_bank = [] + # Lock for __job_bank and __dag_bank + self.__bank_lock = threading.Lock() + # Total running Job and Test failures encountered self.__failures = 0 # Allow threads to set a global exception self.__error_state = False + # Lock for __error_state + self.__error_state_lock = threading.Lock() + # Private set of jobs currently running self.__active_jobs = set([]) @@ -110,12 +112,36 @@ def __init__(self, harness, params): # The last time the scheduler reported something self.last_reported_time = clock() - # True when scheduler.waitFinish() is called. This alerts the scheduler, no more jobs are - # to be scheduled. KeyboardInterrupts are then handled by the thread pools. - self.__waiting = False + # Whether or not to report long running jobs as RUNNING + self.report_long_jobs = True + # Whether or not to enforce the timeout of jobs + self.enforce_timeout = True + + def getErrorState(self): + """ + Gets the thread-safe error state. + """ + with self.__error_state_lock: + return self.__error_state + + def availableSlots(self, params): + """ + Get the number of available slots for processing jobs and + whether or not that limit is a soft or hard limit. + + Needed so that derived schedulers can modify this limit. 
+ """ + if params['max_processes'] == None: + available_slots = 1 + soft_limit = True + else: + available_slots = params['max_processes'] # hard limit + soft_limit = False + return available_slots, soft_limit def triggerErrorState(self): - self.__error_state = True + with self.__error_state_lock: + self.__error_state = True self.run_pool.close() self.status_pool.close() @@ -124,11 +150,9 @@ def killRemaining(self, keyboard=False): with self.activity_lock: for job in self.__active_jobs: job.killProcess() + self.triggerErrorState() if keyboard: - self.triggerErrorState() self.harness.keyboard_interrupt() - else: - self.triggerErrorState() def retrieveJobs(self): """ return all the jobs the scheduler was tasked to perform work for """ @@ -140,22 +164,18 @@ def retrieveDAGs(self): def schedulerError(self): """ boolean if the scheduler prematurely exited """ - return self.__error_state and not self.maxFailures() + return self.getErrorState() and not self.maxFailures() def maxFailures(self): """ Boolean for hitting max failures """ return ((self.options.valgrind_mode and self.__failures >= self.options.valgrind_max_fails) or self.__failures >= self.options.max_fails - and not self.options.pbs) + and not self.options.hpc) def run(self, job): """ Call derived run method """ return - def notifyFinishedSchedulers(self): - """ Notify derived schedulers we are finished """ - return - def augmentJobs(self, jobs): """ Allow derived schedulers to augment jobs before they perform work. @@ -170,35 +190,60 @@ def __sortAndLaunch(self): Sort by largest DAG and launch """ sorted_jobs = sorted(self.__dag_bank, key=lambda x: len(x[1].topological_sort()), reverse=True) - for (jobs, j_dag, j_lock) in sorted_jobs: - self.queueJobs(jobs, j_lock) + for job_dag, _ in sorted_jobs: + # Allow derived schedulers access to the jobs before they launch + # We purposely call this one here so that we augment the Jobs + # in the order that they're launched + self.augmentJobs(job_dag.getJobs()) + # And launch + self.queueJobs(job_dag) + + def setAndOutputJobStatus(self, job, status, caveats=None): + """ + Sets a Job's status and forces the status to be output asap + """ + job.setStatus(status) + with job.getLock(): + job.force_report_status = True + self.handleJobStatus(job, caveats=caveats) + + def isRunning(self): + """ + Returns whether or not the scheduler is currently running jobs. + """ + if self.getErrorState(): + return False + with self.__bank_lock: + if not self.__job_bank: + return False + return True def waitFinish(self): """ Inform the Scheduler to begin running. Block until all jobs finish. 
""" self.__sortAndLaunch() - self.__waiting = True try: + error_state = False + # wait until there is an error, or job_bank has emptied - while self.__job_bank: - if self.__error_state: + while True: + if not self.isRunning(): break sleep(0.1) + error_state = self.getErrorState() + # Completed all jobs sanity check - if not self.__error_state and self.__job_bank: + if not error_state and self.__job_bank: raise SchedulerError('Scheduler exiting with different amount of work than what was initially tasked!') - if not self.__error_state: + if not error_state: self.run_pool.close() self.run_pool.join() self.status_pool.close() self.status_pool.join() - # allow derived schedulers to perform any exit routines - self.notifyFinishedSchedulers() - except KeyboardInterrupt: self.killRemaining(keyboard=True) @@ -213,33 +258,38 @@ def getStatusPoolState(self): def schedule(self, testers): """ Generate and submit a group of testers to a thread pool queue for execution. + + This process is serial. """ # If we are not to schedule any more jobs for some reason, return now - if self.__error_state: + if self.getErrorState(): return + # Nothing to do if there aren't any testers + if not testers: + return + + # Whether or not we have parallel scheduling + root = pyhit.load(testers[0].getSpecFile()) + parallel_scheduling = root.children[0].get('parallel_scheduling', False) # Instance our job DAG, create jobs, and a private lock for this group of jobs (testers) - jobs = JobDAG(self.options) + jobs = JobDAG(self.options, parallel_scheduling) j_dag = jobs.createJobs(testers) - j_lock = threading.Lock() - - # Allow derived schedulers access to the jobs before they launch - self.augmentJobs(jobs) # job-count to tester-count sanity check if j_dag.size() != len(testers): raise SchedulerError('Scheduler was going to run a different amount of testers than what was received (something bad happened)!') - with j_lock: - # As testers (jobs) finish, they are removed from job_bank - self.__job_bank.update(j_dag.topological_sort()) - # List of objects relating to eachother (used for thread locking this job group) - self.__dag_bank.append([jobs, j_dag, j_lock]) + # Don't need to lock below because this process is serial + # As testers (jobs) finish, they are removed from job_bank + self.__job_bank.update(j_dag.topological_sort()) + # List of objects relating to eachother (used for thread locking this job group) + self.__dag_bank.append([jobs, j_dag]) # Store all scheduled jobs self.__scheduled_jobs.append(j_dag.topological_sort()) - def queueJobs(self, jobs, j_lock): + def queueJobs(self, job_dag): """ Determine which queue jobs should enter. Finished jobs are placed in the status pool to be printed while all others are placed in the runner pool to perform work. @@ -247,19 +297,18 @@ def queueJobs(self, jobs, j_lock): A finished job will trigger a change to the Job DAG, which will allow additional jobs to become available and ready to enter the runner pool (dependency jobs). 
""" - state = self.getStatusPoolState() - with j_lock: - concurrent_jobs = jobs.getJobsAndAdvance() + with job_dag.getLock(): + concurrent_jobs = job_dag.getJobsAndAdvance() for job in concurrent_jobs: if job.isFinished(): if not state: - self.status_pool.apply_async(self.jobStatus, (job, jobs, j_lock)) + self.handleJobStatus(job) elif job.isHold(): if not state: job.setStatus(job.queued) - self.run_pool.apply_async(self.runJob, (job, jobs, j_lock)) + self.run_pool.apply_async(self.runJob, (job, job_dag,)) def getLoad(self): """ Method to return current load average """ @@ -275,7 +324,16 @@ def satisfyLoad(self): while self.slots_in_use > 1 and self.getLoad() >= self.average_load: sleep(1.0) - def reserveSlots(self, job, j_lock): + def getJobSlots(self, job): + """ + Gets the number of slots a job will use. + + This exists so that HPC runners can override it, as + jobs like PBS jobs only use one slot because they are + ran externally.""" + return job.getSlots() + + def reserveSlots(self, job): """ Method which allocates resources to perform the job. Returns bool if job should be allowed to run based on available resources. @@ -285,37 +343,48 @@ def reserveSlots(self, job, j_lock): self.satisfyLoad() with self.slot_lock: + job_slots = self.getJobSlots(job) + can_run = False - if self.slots_in_use + job.getSlots() <= self.available_slots: + if self.slots_in_use + job_slots <= self.available_slots: can_run = True # Check for insufficient slots -soft limit - elif job.getSlots() > self.available_slots and self.soft_limit: + elif job_slots > self.available_slots and self.soft_limit: job.addCaveats('OVERSIZED') can_run = True # Check for insufficient slots -hard limit (skip this job) - elif job.getSlots() > self.available_slots and not self.soft_limit: + elif job_slots > self.available_slots and not self.soft_limit: job.addCaveats('insufficient slots') - with j_lock: + with job.getLock(): job.setStatus(job.skip) if can_run: - self.slots_in_use += job.getSlots() + self.slots_in_use += job_slots return can_run - def handleTimeoutJob(self, job, j_lock): + def handleTimeoutJob(self, job): """ Handle jobs that have timed out """ - with j_lock: + with job.getLock(): if job.isRunning(): job.setStatus(job.timeout, 'TIMEOUT') job.killProcess() - def handleLongRunningJob(self, job, jobs, j_lock): - """ Handle jobs that have not reported in the alotted time """ - self.status_pool.apply_async(self.jobStatus, (job, jobs, j_lock)) + def handleJobStatus(self, job, caveats=None): + """ + Possibly reports a job's status. + + Whether or not it actually gets reported... is not so intuitive. + """ + # This try catch will get rid of the "Pool not running" errors + # when we're forced to exit + try: + self.status_pool.apply_async(self.jobStatus, (job,caveats,)) + except ValueError: + pass - def jobStatus(self, job, jobs, j_lock): + def jobStatus(self, job, caveats): """ Instruct the TestHarness to print the status of job. This is a serial threaded operation, so as to prevent clobbering of text being printed @@ -330,43 +399,49 @@ def jobStatus(self, job, jobs, j_lock): # completion as a sanity check). 
state = self.getStatusPoolState() - if state or job not in self.__job_bank: + if state: return + with self.__bank_lock: + if job not in self.__job_bank: + return # Peform within a try, to allow keyboard ctrl-c try: - with j_lock: - if job.isRunning(): - # already reported this job once before + with job.getLock(): + # This job is set to force a status + force_status = job.force_report_status + + if force_status: + with self.activity_lock: + self.jobs_reported.add(job) + job.force_report_status = False + elif job.isRunning(): if job in self.jobs_reported: return # this job will be reported as 'RUNNING' - elif clock() - self.last_reported_time >= self.min_report_time: + if clock() - self.last_reported_time >= self.min_report_time: # prevent 'finished' caveat with options expecting to take lengthy amounts of time if (not self.options.sep_files - and not self.options.ok_files - and not self.options.fail_files - and not self.options.pbs + and not self.options.hpc and not self.options.heavy_tests and not self.options.valgrind_mode): job.addCaveats('FINISHED') with self.activity_lock: self.jobs_reported.add(job) - # TestHarness has not yet been inactive long enough to warrant a report else: # adjust the next report time based on delta of last report time adjusted_interval = max(1, self.min_report_time - max(1, clock() - self.last_reported_time)) job.report_timer = threading.Timer(adjusted_interval, - self.handleLongRunningJob, - (job, jobs, j_lock,)) + self.handleJobStatus, + (job,)) job.report_timer.start() return # Inform the TestHarness of job status - self.harness.handleJobStatus(job) + self.harness.handleJobStatus(job, caveats=caveats) # Reset activity clock if not job.isSilent(): @@ -376,10 +451,11 @@ def jobStatus(self, job, jobs, j_lock): self.__failures += 1 if job.isFinished(): - if job in self.__job_bank: - self.__job_bank.remove(job) - else: - raise SchedulerError('job accountability failure while working with: %s' % (job.getTestName())) + with self.__bank_lock: + if job in self.__job_bank: + self.__job_bank.remove(job) + else: + raise SchedulerError('job accountability failure while working with: %s' % (job.getTestName())) # Max failure threshold reached, begin shutdown if self.maxFailures(): @@ -392,32 +468,38 @@ def jobStatus(self, job, jobs, j_lock): except KeyboardInterrupt: self.killRemaining(keyboard=True) - def runJob(self, job, jobs, j_lock): + def runJob(self, job, jobs): """ Method the run_pool calls when an available thread becomes ready """ # Its possible, the queue is just trying to empty. 
Allow it to do so # with out generating overhead - if self.__error_state: + if self.getErrorState(): return try: # see if we have enough slots to start this job - if self.reserveSlots(job, j_lock): - with j_lock: + if self.reserveSlots(job): + with job.getLock(): job.setStatus(job.running) + # Setup the long running timer, if any + if self.report_long_jobs: + job.report_timer = threading.Timer(self.min_report_time, + self.handleJobStatus, + (job,)) + job.report_timer.start() + else: + job.report_timer = None + with self.activity_lock: self.__active_jobs.add(job) - timeout_timer = threading.Timer(float(job.getMaxTime()), - self.handleTimeoutJob, - (job, j_lock,)) - - job.report_timer = threading.Timer(self.min_report_time, - self.handleLongRunningJob, - (job, jobs, j_lock,)) - - job.report_timer.start() - timeout_timer.start() + if self.enforce_timeout: + timeout_timer = threading.Timer(float(job.getMaxTime()), + self.handleTimeoutJob, + (job,)) + timeout_timer.start() + else: + timeout_timer = None # We have a try here because we want to explicitly catch things like # python errors in _only_ the Job; exceptions that happen in the Tester @@ -425,20 +507,24 @@ def runJob(self, job, jobs, j_lock): try: self.run(job) # Hand execution over to derived scheduler except Exception: - with j_lock: - job.setStatus(StatusSystem().error, 'JOB EXCEPTION') - job.setOutput('Encountered an exception while running Job: %s' % (traceback.format_exc())) - timeout_timer.cancel() + with job.getLock(): + trace = traceback.format_exc() + job.setStatus(StatusSystem().error, 'JOB RUN EXCEPTION') + job.appendOutput(f'Encountered an exception while running Job:\n{trace}') + + if timeout_timer: + timeout_timer.cancel() # Recover worker count before attempting to queue more jobs with self.slot_lock: - self.slots_in_use = max(0, self.slots_in_use - job.getSlots()) + self.slots_in_use = max(0, self.slots_in_use - self.getJobSlots(job)) - # Stop the long running timer - job.report_timer.cancel() + with job.getLock(): + # Stop the long running timer + if job.report_timer: + job.report_timer.cancel() - # All done - with j_lock: + # All done job.setStatus(StatusSystem().finished) with self.activity_lock: @@ -448,12 +534,12 @@ def runJob(self, job, jobs, j_lock): else: # ...currently, place back on hold before placing it back into the queue if not job.isFinished(): - with j_lock: + with job.getLock(): job.setStatus(job.hold) sleep(.1) # Job is done (or needs to re-enter the queue) - self.queueJobs(jobs, j_lock) + self.queueJobs(jobs) except Exception: print('runWorker Exception: %s' % (traceback.format_exc())) @@ -461,3 +547,19 @@ def runJob(self, job, jobs, j_lock): except KeyboardInterrupt: self.killRemaining(keyboard=True) + + def appendResultFooter(self, stats: dict) -> str: + """ Entrypoint to add additional results to the on screen result footer """ + return None + + def appendResultFileHeader(self) -> dict: + """ Entrypoint to add entries to the result file header """ + return {} + + def appendResultFileJob(self, job) -> dict: + """ Entrypoint to add entries to the result file for a job """ + return {} + + def appendStats(self) -> dict: + """ Entrypoint to add entries to the harness statistics """ + return {} diff --git a/python/TestHarness/schedulers/hpc_run.py b/python/TestHarness/schedulers/hpc_run.py new file mode 100755 index 000000000000..025a70ec8ce4 --- /dev/null +++ b/python/TestHarness/schedulers/hpc_run.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +#* This file is part of the MOOSE framework +#* 
https://www.mooseframework.org +#* +#* All rights reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import os, shlex, subprocess, sys, urllib.parse + +# This is a helper script for running an external process in HPC _not_ +# within a shell, which allows for continuity of running things on HPC +# just like we run them within the SubprocessRunner. It allows us to not +# deal with escaping all kinds of crud as we execute it within a shell. +# It takes a single argument, which is the url encoded thing to run, +# decodes it, and runs it. +if __name__ == '__main__': + if len(sys.argv) != 2: + print('ERROR: Expected single argument of the encoded command to run') + sys.exit(1) + + # The command should be encoded on other end with urrllib.parse.quote + encoded_command = sys.argv[1] + command = shlex.split(urllib.parse.unquote(encoded_command)) + + # Try to only print this on rank 0 + rank = os.environ.get('PMI_RANK') # mpich + if rank is None: + rank = os.environ.get('OMPI_COMM_WORLD_RANK') # openmpi + if rank == '0' or rank is None: + print('Running decoded command:', ' '.join(command), flush=True) + + # Run the thing; close_fds=False needed for MPI + process = subprocess.run(command, + stdout=sys.stdout, + stderr=sys.stderr, + close_fds=False) + # This is a wrapper so just exit with the code of whatever we ran + sys.exit(process.returncode) diff --git a/python/TestHarness/schedulers/hpc_source b/python/TestHarness/schedulers/hpc_source new file mode 100644 index 000000000000..abb65f042821 --- /dev/null +++ b/python/TestHarness/schedulers/hpc_source @@ -0,0 +1,2 @@ +export MOOSE_DEV_CONTAINER_MINIMAL_BINDPATH=1 +module load use.moose moose-dev-container-openmpi diff --git a/python/TestHarness/schedulers/hpc_template b/python/TestHarness/schedulers/hpc_template new file mode 100644 index 000000000000..730ff8e66184 --- /dev/null +++ b/python/TestHarness/schedulers/hpc_template @@ -0,0 +1,142 @@ +#!/bin/bash +{%- if SCHEDULER_NAME == "pbs" %} +#PBS -N {{ NAME }} +#PBS -l select={{ NUM_PROCS }}:mpiprocs=1:ncpus={{ NUM_THREADS }} +#PBS -l walltime={{ WALLTIME }} +#PBS -P {{ PROJECT }} +{%- if HOLD is defined %} +#PBS -h +{%- endif %} +{%- if QUEUE is defined %} +#PBS -q {{ QUEUE }} +{%- endif %} +#PBS -j oe +#PBS -o {{ OUTPUT }} +#PBS -l place={{ PLACE }} +{%- elif SCHEDULER_NAME == "slurm" %} +#SBATCH --job-name={{ NAME }} +#SBATCH --ntasks={{ NUM_PROCS }} +#SBATCH --cpus-per-task={{ NUM_THREADS }} +#SBATCH --time={{ WALLTIME }} +#SBATCH --wckey={{ PROJECT }} +#SBATCH --output={{ OUTPUT }} +{%- if HOLD is defined %} +#SBATCH --hold +{%- endif %} +{%- if PLACE == "scatter" %} +#SBATCH --ntasks-per-node=1 +{%- endif %} +{%- endif %} + +# Exit on failure +set -e + +{%- if SOURCE_FILE is defined %} +# Loaded from {{ SOURCE_FILE }} +{{ SOURCE_CONTENTS }} +{%- endif %} + +# Add MOOSE's python path for python scripts +export PYTHONPATH={{ MOOSE_PYTHONPATH }}:${PYTHONPATH} + +# Print a useful header +echo "TestHarness {{ SCHEDULER_NAME }} job on $(hostname) in job ${{ JOB_ID_VARIABLE }}" +echo "Time: $(date)" +echo 'Test: {{ TEST_SPEC }}:{{ TEST_NAME }}' +echo 'Directory: {{ CWD }}' +echo 'Submitted hostname: {{ SUBMITTED_HOSTNAME }}' +echo 'Submission script: {{ SUBMISSION_SCRIPT }}' +echo 'Output: {{ OUTPUT }}' +echo 'Result: {{ RESULT }}' +module list + +echo 
"################################################################################" +echo "Beginning TestHarness {{ SCHEDULER_NAME }} test execution" + +# Move into the test directory +cd {{ CWD }} + +# Make a temp file to store the time +time_output=$(mktemp) + +# Make a temporary directory that's shared for this job. Because slurm doesn't +# make a tmpdir by default, this gets us a consistent tmpdir across all schedulers +{%- if SCHEDULER_NAME == "pbs" %} +NUM_NODES=$(sort $PBS_NODEFILE | uniq -c | wc -l) +{%- else %} +NUM_NODES=${SLURM_JOB_NUM_NODES} +{%- endif %} +JOB_TMPDIR="$(mktemp -d -u --suffix _${{ JOB_ID_VARIABLE }})" +# This ONLY works for openmpi right now; -N needs to be -ppn for mpich +mpiexec -n ${NUM_NODES} -N 1 mkdir ${JOB_TMPDIR} +export TMPDIR="${JOB_TMPDIR}" + +# If we're using a run with APPTAINER_SHARENS, we really don't want to use /home +# as a location for storing instance state as it can be very fickle. So, use a +# temprorary one +if [ "$APPTAINER_SHARENS" == "1" ]; then + export APPTAINER_CONFIGDIR="${JOB_TMPDIR}/.apptainer" +fi + +# Don't exit on failure: need to capture the actual run's return code +set +e +# Run the command, wrapped in time so that we can capture the real runtime +# We use which here to make sure we don't get the bash function 'time' +$(which time) -f %e -o ${time_output} {{ COMMAND }} +# ...and capture the return code cause we're not done yet +return_code=$? +# Exit on failure +set -e + +# We will read this output later on to try to capture the return code +# in the event that PBS doesn't get it to us correctly +echo "################################################################################" +{%- if USING_APPTAINER is defined %} +# We have a special case with exit codes when we run within apptainer. Sometimes when +# codes are received when running in a container, the container will return with exit code +# 128 + . Capture that here because we don't wanna exit code a code > 128, which +# are special exit codes for the schedulers. +if ((return_code > 128)); then + new_return_code=$((return_code - 128)) + echo "Apptainer exited with code $return_code, using $new_return_code instead" + return_code=$new_return_code +fi +{%- endif %} +# Load the execution time; we use a tail here because the process will +# include a comment about a non-zero status first if the exit code is nonzero +walltime=$(tail -1 ${time_output}) +rm ${time_output} +# Print the exit footer +echo "Completed TestHarness {{ SCHEDULER_NAME }} test execution; exit code = $return_code, walltime = $walltime sec" + +# Build the result file +touch {{ RESULT }} +echo "exit_code: $return_code" >> {{ RESULT }} +echo "walltime: $walltime" >> {{ RESULT }} + +# Append a terminator to all of the output files for file syncing across NFS +ADDITIONAL_OUTPUT_FILES=({{ ADDITIONAL_OUTPUT_FILES }}) +for file in ${ADDITIONAL_OUTPUT_FILES[@]}; do + if [ ! -e "$file" ]; then + echo "Failed to find output $file" + continue + fi + + printf "{{ ENDING_COMMENT }}" >> $file; + if [ $? != 0 ]; then + echo "Failed to finalize output $file" + fi +done + +# Append a recognizable string at the end of the output. 
We look +# for this string when parsing the output so that we can be sure +# that we have obtained all of the output +printf "{{ ENDING_COMMENT }}" + +# Clean up the job tempdir +set +e +mpiexec -n ${NUM_NODES} -N 1 rm -rf ${JOB_TMPDIR} + +# Exit with the real return code from the job that we ran +exit $return_code + diff --git a/python/TestHarness/schedulers/pbs_template b/python/TestHarness/schedulers/pbs_template deleted file mode 100644 index f3753c7ad741..000000000000 --- a/python/TestHarness/schedulers/pbs_template +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -#PBS -N -#PBS -l select=:ncpus= -#PBS -l walltime= - - -#PBS -j oe -#PBS -o -#PBS -l place=free - -JOB_NUM=${PBS_JOBID%\.*} - -export MV2_ENABLE_AFFINITY=0 - -cd - diff --git a/python/TestHarness/testers/AnalyzeJacobian.py b/python/TestHarness/testers/AnalyzeJacobian.py index 38a4585a159d..6e833b71d121 100644 --- a/python/TestHarness/testers/AnalyzeJacobian.py +++ b/python/TestHarness/testers/AnalyzeJacobian.py @@ -28,7 +28,7 @@ def validParams(): def __init__(self, name, params): FileTester.__init__(self, name, params) - def getOutputFiles(self): + def getOutputFiles(self, options): # analizejacobian.py outputs files prefixed with the input file name return [self.specs['input']] @@ -71,20 +71,35 @@ def getCommand(self, options): return command - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): reason = '' specs = self.specs if specs.isValid('expect_out'): - out_ok = util.checkOutputForPattern(output, specs['expect_out']) - if (out_ok and self.exit_code != 0): + out_ok = util.checkOutputForPattern(runner_output, specs['expect_out']) + if (out_ok and exit_code != 0): reason = 'OUT FOUND BUT CRASH' elif (not out_ok): reason = 'NO EXPECTED OUT' if reason == '': - if self.exit_code != 0 : + if exit_code != 0: reason = 'CRASH' if reason != '': self.setStatus(self.fail, reason) - return output + return '' + + def checkRunnable(self, options): + # We cannot rely on an external script running things within HPC + if options.hpc: + self.addCaveats('hpc unsupported') + self.setStatus(self.skip) + return False + + # This doesn't pass valgrind arguments + if options.valgrind_mode: + self.addCaveats('valgrind=false') + self.setStatus(self.skip) + return False + + return FileTester.checkRunnable(self, options) diff --git a/python/TestHarness/testers/CSVDiff.py b/python/TestHarness/testers/CSVDiff.py index 48141a83e55d..2e34f587a43f 100644 --- a/python/TestHarness/testers/CSVDiff.py +++ b/python/TestHarness/testers/CSVDiff.py @@ -29,7 +29,7 @@ def validParams(): def __init__(self, name, params): FileTester.__init__(self, name, params) - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['csvdiff'] # Check that override parameter lists are the same length @@ -89,8 +89,8 @@ def processResultsCommand(self, moose_dir, options): return commands - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail() or self.specs['skip_checks']: return output diff --git a/python/TestHarness/testers/CSVValidationTester.py b/python/TestHarness/testers/CSVValidationTester.py index a4dcc2ebcc13..b3f8e972865c 100644 --- a/python/TestHarness/testers/CSVValidationTester.py +++ b/python/TestHarness/testers/CSVValidationTester.py @@ -108,8 
+108,8 @@ def __init__(self, name, params): # formatting self.file_name_len = 40 - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail() or self.specs['skip_checks']: return output @@ -118,7 +118,6 @@ def processResults(self, moose_dir, options, output): if options.scaling and self.specs['scale_refine']: return output - output = "" # Make sure that all of the CSVDiff files are actually available for file in self.specs['csvdiff']: if not os.path.exists(os.path.join(self.getTestDir(), self.specs['gold_dir'], file)): diff --git a/python/TestHarness/testers/CheckFiles.py b/python/TestHarness/testers/CheckFiles.py index 78d0a04306c3..20a11468d642 100644 --- a/python/TestHarness/testers/CheckFiles.py +++ b/python/TestHarness/testers/CheckFiles.py @@ -28,11 +28,11 @@ def __init__(self, name, params): if not (params.isValid('check_files') or params.isValid('check_not_exists')): raise Exception('Either "check_files" or "check_not_exists" must be supplied for a CheckFiles test') - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['check_files'] + self.specs['check_not_exists'] - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) specs = self.specs @@ -75,3 +75,12 @@ def processResults(self, moose_dir, options, output): self.setStatus(self.fail, reason) return output + + def checkRunnable(self, options): + # We cannot reliably check if files do not exist with a networked file system + if options.hpc and self.specs['check_not_exists']: + self.addCaveats('hpc unsupported') + self.setStatus(self.skip) + return False + + return super().checkRunnable(options) diff --git a/python/TestHarness/testers/Exodiff.py b/python/TestHarness/testers/Exodiff.py index 8668756ee10d..9cf75dd0e214 100644 --- a/python/TestHarness/testers/Exodiff.py +++ b/python/TestHarness/testers/Exodiff.py @@ -41,7 +41,7 @@ def __init__(self, name, params): if self.specs['map'] and self.specs['partial']: raise Exception("For the Exodiff tester, you cannot specify both 'map' and 'partial' as True") - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['exodiff'] def processResultsCommand(self, moose_dir, options): @@ -72,8 +72,8 @@ def processResultsCommand(self, moose_dir, options): return commands - def processResults(self, moose_dir, options, output): - FileTester.processResults(self, moose_dir, options, output) + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail() or self.specs['skip_checks']: return output diff --git a/python/TestHarness/testers/FileTester.py b/python/TestHarness/testers/FileTester.py index 73e1eb5eb4d8..c2ebc3176378 100644 --- a/python/TestHarness/testers/FileTester.py +++ b/python/TestHarness/testers/FileTester.py @@ -27,4 +27,4 @@ def __init__(self, name, params): def prepare(self, options): if self.specs['delete_output_before_running']: - util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles()) + util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles(options)) diff --git 
a/python/TestHarness/testers/ImageDiff.py b/python/TestHarness/testers/ImageDiff.py index 379266ec95bc..4adff301de0c 100644 --- a/python/TestHarness/testers/ImageDiff.py +++ b/python/TestHarness/testers/ImageDiff.py @@ -33,16 +33,16 @@ def __init__(self, name, params): elif 'skimage' not in self.specs['required_python_packages']: self.specs['required_python_packages'] += ' skimage' - def getOutputFiles(self): + def getOutputFiles(self, options): return self.specs['imagediff'] - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): """ Perform image diff """ # Call base class processResults - FileTester.processResults(self, moose_dir, options, output) + output = super().processResults(moose_dir, options, exit_code, runner_output) if self.isFail(): return output diff --git a/python/TestHarness/testers/JSONDiff.py b/python/TestHarness/testers/JSONDiff.py index 68d5a72ecb68..810bdfacd547 100644 --- a/python/TestHarness/testers/JSONDiff.py +++ b/python/TestHarness/testers/JSONDiff.py @@ -8,9 +8,8 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html from SchemaDiff import SchemaDiff -from TestHarness import util -class JSONDiff(SchemaDiff): +class JSONDiff(SchemaDiff): @staticmethod def validParams(): params = SchemaDiff.validParams() @@ -45,10 +44,6 @@ def __init__(self, name, params): re_entry += f"\['{key}'\]" self.exclude_regex_paths.append(re_entry) - def prepare(self, options): - if self.specs['delete_output_before_running'] == True: - util.deleteFilesAndFolders(self.getTestDir(), self.specs['jsondiff']) - def load_file(self, path1): import json with open(path1,"r") as f: diff --git a/python/TestHarness/testers/PetscJacobianTester.py b/python/TestHarness/testers/PetscJacobianTester.py index 684d8992e238..2cb7fb8fa92c 100644 --- a/python/TestHarness/testers/PetscJacobianTester.py +++ b/python/TestHarness/testers/PetscJacobianTester.py @@ -95,17 +95,19 @@ def __compare(self, value, threshold): else: return False - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): + output = '' + if self.old_petsc: if self.specs['state'].lower() == 'user': m = re.search("Norm of matrix ratio (\S+?),? difference (\S+) \(user-defined state\)", - output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL) elif self.specs['state'].lower() == 'const_positive': m = re.search("Norm of matrix ratio (\S+?),? difference (\S+) \(constant state 1\.0\)", - output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL) elif self.specs['state'].lower() == 'const_negative': m = re.search("Norm of matrix ratio (\S+?),? 
difference (\S+) \(constant state -1\.0\)", - output, re.MULTILINE | re.DOTALL); + runner_output, re.MULTILINE | re.DOTALL) else: self.setStatus("state must be either 'user', const_positive', or 'const_negative'", self.bucket_fail) @@ -122,7 +124,7 @@ def processResults(self, moose_dir, options, output): else: matches = re.finditer("\|\|J - Jfd\|\|_F/\|\|J\|\|_F\s?=?\s?(\S+), \|\|J - Jfd\|\|_F\s?=?\s?(\S+)", - output, re.MULTILINE | re.DOTALL) + runner_output, re.MULTILINE | re.DOTALL) reason = 'EXPECTED OUTPUT NOT FOUND' for match in matches: diff --git a/python/TestHarness/testers/PythonUnitTest.py b/python/TestHarness/testers/PythonUnitTest.py index 1a77cd6e9254..4145d52a140d 100644 --- a/python/TestHarness/testers/PythonUnitTest.py +++ b/python/TestHarness/testers/PythonUnitTest.py @@ -46,4 +46,27 @@ def getCommand(self, options): else: cmd = "python3 -m unittest" + use_buffer + "-v " + test_case - return cmd + ' '.join(self.specs['cli_args']) + return cmd + ' '.join(self.specs['cli_args']) + + def checkRunnable(self, options): + # Don't run unit tests on HPC. These tests commonly involve running + # an appliacation within a black box script, which we cannot control + # very well within the HPC environment + if options.hpc: + self.addCaveats('hpc unsupported') + self.setStatus(self.skip) + return False + + return super().checkRunnable(options) + + def getProcs(self, options): + procs = super().getProcs(options) + # If we start within a script within apptainer and then call mpiexec on HPC, + # it will not work because the mpiexec call needs to be outside of the apptainer + # call. So, limit these tests to 1 proc + if options.hpc and \ + os.environ.get('APPTAINER_CONTAINER') and \ + int(self.specs['min_parallel']) == 1 and procs != 1: + self.addCaveats('hpc apptainer max_cpus=1') + return 1 + return procs diff --git a/python/TestHarness/testers/RunApp.py b/python/TestHarness/testers/RunApp.py index 25c325eb0d93..b3c8d41f78e0 100644 --- a/python/TestHarness/testers/RunApp.py +++ b/python/TestHarness/testers/RunApp.py @@ -79,7 +79,7 @@ def __init__(self, name, params): def getInputFile(self): if self.specs.isValid('input'): - return self.specs['input'].strip() + return os.path.join(self.getTestDir(), self.specs['input'].strip()) else: return None # Not all testers that inherit from RunApp have an input file @@ -119,9 +119,26 @@ def checkRunnable(self, options): self.setStatus(self.skip) return False + if options.hpc and self.specs.isValid('command_proxy') and os.environ.get('APPTAINER_CONTAINER') is not None: + self.addCaveats('hpc unsupported') + self.setStatus(self.skip) + return False + + # Finalizing output using the current method in the submission script from the rank 0 process isn't + # really a good idea when output might exist on a different node. We could make that finalization + # more complex, but there isn't a need at the moment. 
+ if options.hpc and self.specs['redirect_output'] == True and int(self.specs['min_parallel']) > 1: + self.addCaveats('hpc min_cpus=1') + self.setStatus(self.skip) + return False + return True def getThreads(self, options): + # This disables additional arguments + if self.specs['no_additional_cli_args']: + return 1 + #Set number of threads to be used lower bound nthreads = max(options.nthreads, int(self.specs['min_threads'])) #Set number of threads to be used upper bound @@ -135,16 +152,29 @@ def getThreads(self, options): return nthreads def getProcs(self, options): + # This disables additional arguments + if self.specs['no_additional_cli_args']: + return 1 + if options.parallel == None: default_ncpus = 1 else: default_ncpus = options.parallel + min_parallel = int(self.specs['min_parallel']) + # Raise the floor - ncpus = max(default_ncpus, int(self.specs['min_parallel'])) + ncpus = max(default_ncpus, min_parallel) # Lower the ceiling ncpus = min(ncpus, int(self.specs['max_parallel'])) + # Finalizing output using the current method in the submission script from the rank 0 process isn't + # really a good idea when output might exist on a different node. We could make that finalization + # more complex, but there isn't a need at the moment. + if options.hpc and self.specs['redirect_output'] == True and min_parallel == 1 and ncpus > 1: + self.addCaveats('hpc min_cpus=1') + return 1 + if ncpus > default_ncpus: self.addCaveats('min_cpus=' + str(ncpus)) elif ncpus < default_ncpus: @@ -161,14 +191,19 @@ def getCommand(self, options): # Check for built application if shutil.which(specs['executable']) is None: - self.setStatus(self.fail, 'Application not found') - return '' + self.setStatus(self.error, 'APPLICATION NOT FOUND') # If no_additional_cli_args is set to True, return early with a simplified command line ignoring # all other TestHarness supplied options. 
if specs['no_additional_cli_args']: # TODO: Do error checking for TestHarness options that will be silently ignored - return os.path.join(specs['test_dir'], specs['executable']) + ' ' + ' '.join(specs['cli_args']) + cmd = os.path.join(specs['test_dir'], specs['executable']) + ' ' + ' '.join(specs['cli_args']) + + # Need to run mpiexec with containerized openmpi + if options.hpc and self.hasOpenMPI(): + cmd = f'mpiexec -n 1 {cmd}' + + return cmd # Create the additional command line arguments list cli_args = list(specs['cli_args']) @@ -225,7 +260,8 @@ def getCommand(self, options): elif nthreads > 1: command = command + ' --n-threads=' + str(nthreads) - if self.force_mpi or options.parallel or ncpus > 1: + # Force mpi, more than 1 core, or containerized openmpi (requires mpiexec serial) + if self.force_mpi or ncpus > 1 or (options.hpc and self.hasOpenMPI()): command = f'{self.mpi_command} -n {ncpus} {command}' # Arbitrary proxy command, but keep track of the command so that someone could use it later @@ -235,7 +271,7 @@ def getCommand(self, options): return command - def testFileOutput(self, moose_dir, options, output): + def testFileOutput(self, moose_dir, options, runner_output): """ Set a failure status for expressions found in output """ reason = '' errors = '' @@ -251,16 +287,13 @@ def testFileOutput(self, moose_dir, options, output): custom_module = importlib.util.module_from_spec(custom_mod_spec) sys.modules['custom_module'] = custom_module custom_mod_spec.loader.exec_module(custom_module) - if custom_module.custom_evaluation(output): + if custom_module.custom_evaluation(runner_output): return errors else: errors += "#"*80 + "\n\n" + "Custom evaluation failed.\n" self.setStatus(self.fail, "CUSTOM EVAL FAILED") return errors - - - params_and_msgs = {'expect_err': {'error_missing': True, 'modes': ['ALL'], @@ -287,10 +320,10 @@ def testFileOutput(self, moose_dir, options, output): if specs.isValid(param) and (options.method in attr['modes'] or attr['modes'] == ['ALL']): match_type = "" if specs['match_literal']: - have_expected_out = util.checkOutputForLiteral(output, specs[param]) + have_expected_out = util.checkOutputForLiteral(runner_output, specs[param]) match_type = 'literal' else: - have_expected_out = util.checkOutputForPattern(output, specs[param]) + have_expected_out = util.checkOutputForPattern(runner_output, specs[param]) match_type = 'pattern' # Exclusive OR test @@ -304,7 +337,7 @@ def testFileOutput(self, moose_dir, options, output): return errors - def testExitCodes(self, moose_dir, options, output): + def testExitCodes(self, moose_dir, options, exit_code, runner_output): # Don't do anything if we already have a status set reason = '' if self.isNoStatus(): @@ -312,25 +345,25 @@ def testExitCodes(self, moose_dir, options, output): # We won't pay attention to the ERROR strings if EXPECT_ERR is set (from the derived class) # since a message to standard error might actually be a real error. This case should be handled # in the derived class. 
- if options.valgrind_mode == '' and not specs.isValid('expect_err') and len( [x for x in filter( lambda x: x in output, specs['errors'] )] ) > 0: + if options.valgrind_mode == '' and not specs.isValid('expect_err') and len( [x for x in filter( lambda x: x in runner_output, specs['errors'] )] ) > 0: reason = 'ERRMSG' - elif self.exit_code == 0 and specs['should_crash'] == True: + elif exit_code == 0 and specs['should_crash'] == True: reason = 'NO CRASH' - elif self.exit_code != 0 and specs['should_crash'] == False: + elif exit_code != 0 and specs['should_crash'] == False and self.shouldExecute(): # Let's look at the error code to see if we can perhaps further split this out later with a post exam reason = 'CRASH' # Valgrind runs - elif self.exit_code == 0 and self.shouldExecute() and options.valgrind_mode != '' and 'ERROR SUMMARY: 0 errors' not in output: + elif exit_code == 0 and self.shouldExecute() and options.valgrind_mode != '' and 'ERROR SUMMARY: 0 errors' not in runner_output: reason = 'MEMORY ERROR' if reason != '': self.setStatus(self.fail, str(reason)) - return "\n\nExit Code: " + str(self.exit_code) + return "\n\nExit Code: " + str(exit_code) # Return anything extra here that we want to tack onto the Output for when it gets printed later return '' - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): """ Wrapper method for testFileOutput. @@ -344,7 +377,21 @@ def processResults(self, moose_dir, options, output): # TODO: because RunParallel is now setting every successful status message, refactor testFileOutput and processResults. """ - output += self.testFileOutput(moose_dir, options, output) - output += self.testExitCodes(moose_dir, options, output) + output = '' + output += self.testFileOutput(moose_dir, options, runner_output) + output += self.testExitCodes(moose_dir, options, exit_code, runner_output) return output + + def mustOutputExist(self, exit_code): + if self.specs['should_crash']: + return exit_code != 0 + return exit_code == 0 + + def needFullOutput(self, options): + # We need the full output when we're trying to read from said output + params = ['expect_err', 'expect_assert', 'expect_out', 'absent_out'] + for param in params: + if self.specs.isValid(param): + return True + return super().needFullOutput(options) diff --git a/python/TestHarness/testers/RunCommand.py b/python/TestHarness/testers/RunCommand.py index 74c1d879c05e..31fb1df45fe7 100644 --- a/python/TestHarness/testers/RunCommand.py +++ b/python/TestHarness/testers/RunCommand.py @@ -26,10 +26,10 @@ def getCommand(self, options): # Create the command line string to run return self.command - def processResults(self, moose_dir, options, output): - if self.exit_code == 77 : + def processResults(self, moose_dir, options, exit_code, runner_output): + if exit_code == 77 : self.setStatus(self.skip) - elif self.exit_code != 0 : - self.setStatus(self.fail, 'CODE %d' % self.exit_code) + elif exit_code != 0: + self.setStatus(self.fail, 'CODE %d' % exit_code) - return output + return '' diff --git a/python/TestHarness/testers/RunException.py b/python/TestHarness/testers/RunException.py index bf6effe23c24..6a9791036f08 100644 --- a/python/TestHarness/testers/RunException.py +++ b/python/TestHarness/testers/RunException.py @@ -35,23 +35,20 @@ def checkRunnable(self, options): self.addCaveats('type=RunException') self.setStatus(self.skip) return False + # We seem to have issues with --redirect-output causing + # "Inappropriate ioctl for device 
(25)" errors, so if this test + # requires more procs, we can't run it + if options.hpc and int(self.specs['min_parallel'] > 1): + self.addCaveats('hpc max_cpus=1') + return False return RunApp.checkRunnable(self, options) def prepare(self, options): - if self.getProcs(options) > 1: - file_paths = [] - for processor_id in range(self.getProcs(options)): - file_paths.append(self.name() + '.processor.{}'.format(processor_id)) - util.deleteFilesAndFolders(self.getTestDir(), file_paths, False) - - def processResults(self, moose_dir, options, output): - # Exceptions are written to stderr, which can be interleaved so we normally redirect these - # separate files. Here we must gather those file outputs before processing if self.hasRedirectedOutput(options): - redirected_output = util.getOutputFromFiles(self, options) - output += redirected_output - - output += self.testFileOutput(moose_dir, options, output) - output += self.testExitCodes(moose_dir, options, output) + files = self.getRedirectedOutputFiles(options) + util.deleteFilesAndFolders(self.getTestDir(), files, False) - return output + def getOutputFiles(self, options): + if self.hasRedirectedOutput(options): + return self.getRedirectedOutputFiles(options) + return [] diff --git a/python/TestHarness/testers/SchemaDiff.py b/python/TestHarness/testers/SchemaDiff.py index 6e7ba4eef30a..0b5fc7c9be83 100644 --- a/python/TestHarness/testers/SchemaDiff.py +++ b/python/TestHarness/testers/SchemaDiff.py @@ -36,13 +36,16 @@ def __init__(self, name, params): # So that derived classes can internally pass skip regex paths self.exclude_regex_paths = [] + def getOutputFiles(self, options): + return self.specs['schemadiff'] + def prepare(self, options): if self.specs['delete_output_before_running'] == True: - util.deleteFilesAndFolders(self.getTestDir(), self.specs['schemadiff']) + util.deleteFilesAndFolders(self.getTestDir(), self.getOutputFiles(options)) + + def processResults(self, moose_dir, options, exit_code, runner_output): + output = super().processResults(moose_dir, options, exit_code, runner_output) - def processResults(self, moose_dir, options, output): - output += self.testFileOutput(moose_dir, options, output) - output += self.testExitCodes(moose_dir, options, output) specs = self.specs if self.isFail() or specs['skip_checks']: diff --git a/python/TestHarness/testers/SignalTester.py b/python/TestHarness/testers/SignalTester.py index e26524f17713..83825b2fca14 100644 --- a/python/TestHarness/testers/SignalTester.py +++ b/python/TestHarness/testers/SignalTester.py @@ -38,43 +38,15 @@ def __init__(self, name, params): f"a supported signal type. Currently supported signal types are:\n{', '.join(list(valid_signals.keys()))}") raise e - def send_signal(self): - """Function used to send a signal to the program automatically for testing purposes.""" - - # Create a while loop that checks if the stdout buffer has any data in it, and then sends the signal once - # it knows that the moose_test binary is actually doing something. - - # process.poll() returns the process's exit code if it has completed, and None if it is still running. - # This acts as a safety precaution against an infinite loop -- this will always close. - while self.process.poll() is None: - - # tell() gives the current position in the file. If it is greater than zero, the binary - # has started running and writing output. - # if the output is blank, the moose_test binary hasn't actually started doing anything yet. - # if so, sleep briefly and check again. 
- if not self.outfile.tell(): - time.sleep(0.05) - - # if the output isn't blank, then we finally send the signal and exit the loop - else: - try: - os.kill(self.process.pid, self.signal) - break - except ProcessLookupError as e: - print("Unable to send signal to process. Has it already terminated?") - raise e - - def runCommand(self, timer, options): - """ - Helper method for running external (sub)processes as part of the tester's execution. This - uses the tester's getCommand and getTestDir methods to run a subprocess. The timer must - be the same timer passed to the run method. Results from running the subprocess is stored - in the tester's output and exit_code fields. - """ - - exit_code = super().spawnSubprocessFromOptions(timer, options) - if exit_code: # Something went wrong - return - - self.send_signal() - super().finishAndCleanupSubprocess(timer) + def checkRunnable(self, options): + # We could probably configure sending signals via pbs and slurm + # but for now that's a no + if options.hpc: + self.addCaveats('hpc unsupported') + self.setStatus(self.skip) + return False + + return super().checkRunnable(options) + + def postSpawn(self, runner): + runner.sendSignal(self.signal) diff --git a/python/TestHarness/testers/Tester.py b/python/TestHarness/testers/Tester.py index 07b018d5ff84..3698b5ca4c0f 100644 --- a/python/TestHarness/testers/Tester.py +++ b/python/TestHarness/testers/Tester.py @@ -7,17 +7,14 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import platform, re, os, sys, pkgutil, shutil, shlex +import re, os, sys, shutil import mooseutils -from TestHarness import util +from TestHarness import OutputInterface, util from TestHarness.StatusSystem import StatusSystem from FactorySystem.MooseObject import MooseObject -from tempfile import SpooledTemporaryFile, TemporaryDirectory from pathlib import Path -import subprocess -from signal import SIGTERM -class Tester(MooseObject): +class Tester(MooseObject, OutputInterface): """ Base class from which all tester objects are instanced. 
""" @@ -26,10 +23,10 @@ def validParams(): params = MooseObject.validParams() # Common Options - params.addRequiredParam('type', "The type of test of Tester to create for this test.") - params.addParam('max_time', int(os.getenv('MOOSE_TEST_MAX_TIME', 300)), "The maximum in seconds that the test will be allowed to run.") - params.addParam('skip', "Provide a reason this test will be skipped.") - params.addParam('deleted', "Tests that only show up when using the '-e' option (Permanently skipped or not implemented).") + params.addRequiredParam('type', "The type of test of Tester to create for this test.") + params.addParam('max_time', Tester.getDefaultMaxTime(), "The maximum in seconds that the test will be allowed to run.") + params.addParam('skip', "Provide a reason this test will be skipped.") + params.addParam('deleted', "Tests that only show up when using the '-e' option (Permanently skipped or not implemented).") params.addParam('unique_test_id', "The unique hash given to a test") params.addParam('heavy', False, "Set to True if this test should only be run when the '--heavy' option is used.") @@ -115,22 +112,20 @@ def validParams(): params.addParam("deprecated", False, "When True the test is no longer considered part SQA process and as such does not include the need for a requirement definition.") params.addParam("collections", [], "A means for defining a collection of tests for SQA process.") params.addParam("classification", 'functional', "A means for defining a requirement classification for SQA process.") - return params - def __del__(self): - # Do any cleaning that we can (removes the temp dir for now if it exists) - self.cleanup() + params.addParam('hpc', True, 'Set to false to not run with HPC schedulers (PBS and slurm)') + + return params # This is what will be checked for when we look for valid testers IS_TESTER = True def __init__(self, name, params): MooseObject.__init__(self, name, params) + OutputInterface.__init__(self) + self.specs = params - self.outfile = None - self.errfile = None self.joined_out = '' - self.exit_code = 0 self.process = None self.tags = params['tags'] self.__caveats = set([]) @@ -149,8 +144,11 @@ def __init__(self, name, params): if self.specs["allow_test_objects"]: self.specs["cli_args"].append("--allow-test-objects") - ### Enumerate the tester statuses we want to use - self.test_status = StatusSystem() + # The Tester status; here we do not use locks because we need to + # do deep copy operations of a Tester object, and thread locks + # cannot be deep copied. + self.test_status = StatusSystem(locking=False) + # Enumerate the tester statuses we want to use self.no_status = self.test_status.no_status self.queued = self.test_status.queued self.skip = self.test_status.skip @@ -159,34 +157,14 @@ def __init__(self, name, params): self.fail = self.test_status.fail self.diff = self.test_status.diff self.deleted = self.test_status.deleted + self.error = self.test_status.error self.__failed_statuses = self.test_status.getFailingStatuses() self.__skipped_statuses = [self.skip, self.silent] - # A temp directory for this Tester, if requested - self.tmp_dir = None - - def getTempDirectory(self): - """ - Gets a shared temp directory that will be cleaned up for this Tester - """ - if self.tmp_dir is None: - self.tmp_dir = TemporaryDirectory(prefix='tester_') - return self.tmp_dir - - def cleanup(self): - """ - Entry point for doing any cleaning if necessary. 
- - Currently just cleans up the temp directory - """ - if self.tmp_dir is not None: - # Don't let this fail - try: - self.tmp_dir.cleanup() - except: - pass - self.tmp_dir = None + # The command that we actually ended up running; this may change + # depending on the runner which might inject something + self.command_ran = None def getStatus(self): return self.test_status.getStatus() @@ -198,20 +176,51 @@ def setStatus(self, status, message=''): def createStatus(self): return self.test_status.createStatus() - # Return a tuple (status, message, caveats) for this tester as found - # in the .previous_test_results.json file (or supplied json object) - def previousTesterStatus(self, options, previous_storage=None): - if not previous_storage: - previous_storage = options.results_storage + def getResultsEntry(self, options, create, graceful=False): + """ Get the entry in the results storage for this tester """ + tests = options.results_storage['tests'] + + test_dir = self.getTestDir() + test_dir_entry = tests.get(test_dir) + if not test_dir_entry: + if not create: + if graceful: + return None, None + raise Exception(f'Test folder {test_dir} not in results') + tests[test_dir] = {} + test_dir_entry = tests[test_dir] + + test_name = self.getTestName() + test_name_entry = test_dir_entry.get(test_name) + if not test_name_entry: + if not create: + if graceful: + return test_dir_entry, None + raise Exception(f'Test {test_dir}/{test_name} not in results') + test_dir_entry[test_name] = {} + return test_dir_entry, test_dir_entry.get(test_name) - status_exists = previous_storage.get(self.getTestDir(), {}).get(self.getTestName(), None) + # Return a tuple (status, message, caveats) for this tester as found + # in the previous results + def previousTesterStatus(self, options): + test_dir_entry, test_entry = self.getResultsEntry(options, False, True) status = (self.test_status.createStatus(), '', '') - if status_exists: - status = (self.test_status.createStatus(str(status_exists['STATUS'])), - str(status_exists['STATUS_MESSAGE']), - status_exists['CAVEATS']) + if test_entry: + status = (self.test_status.createStatus(str(test_entry['status'])), + str(test_entry['status_message']), + test_entry['caveats']) return (status) + def getResults(self, options) -> dict: + """Get the results dict for this Tester""" + output_files = [] + for file in self.getOutputFiles(options): + output_files.append(os.path.join(self.getTestDir(), file)) + return {'name': self.__class__.__name__, + 'command': self.getCommand(options), + 'input_file': self.getInputFile(), + 'output_files': output_files} + def getStatusMessage(self): return self.__tester_message @@ -232,11 +241,26 @@ def isDiff(self): return self.getStatus() == self.diff def isDeleted(self): return self.getStatus() == self.deleted + def isError(self): + return self.getStatus() == self.error def getTestName(self): """ return test name """ return self.specs['test_name'] + def getTestNameShort(self): + """ return test short name (not including the path) """ + return self.specs['test_name_short'] + + def appendTestName(self, value): + """ + Appends a value to the test name. + + Used when creating duplicate Testers for recover tests. 
+ """ + self.specs['test_name'] += value + self.specs['test_name_short'] += value + def getPrereqs(self): """ return list of prerequisite tests this test depends on """ return self.specs['prereq'] @@ -255,6 +279,9 @@ def getTestDir(self): return os.path.join(self.specs['test_dir'], self.specs['working_directory']) return self.specs['test_dir'] + def getSpecFile(self): + return os.path.join(self.specs['test_dir'], self.specs['spec_file']) + def getMinReportTime(self): """ return minimum time elapse before reporting a 'long running' status """ return self.specs['min_reported_time'] @@ -263,6 +290,19 @@ def getMaxTime(self): """ return maximum time elapse before reporting a 'timeout' status """ return float(self.specs['max_time']) + def setMaxTime(self, value): + """ + Sets the max time for the job + """ + self.specs['max_time'] = float(value) + + @staticmethod + def getDefaultMaxTime(): + """ + Gets the default max run time + """ + return int(os.getenv('MOOSE_TEST_MAX_TIME', 300)) + def getUniqueTestID(self): """ return unique hash for test """ return self.specs['unique_test_id'] @@ -281,23 +321,19 @@ def getInputFileContents(self): """ return the contents of the input file applicable to this Tester """ return None - def getOutputFiles(self): + def getOutputFiles(self, options): """ return the output files if applicable to this Tester """ return [] - def getOutput(self): - """ Return the contents of stdout and stderr """ - return self.joined_out - def getCheckInput(self): return self.check_input def setValgrindMode(self, mode): """ Increase the alloted time for tests when running with the valgrind option """ if mode == 'NORMAL': - self.specs['max_time'] = float(self.specs['max_time']) * 2 + self.setMaxTime(self.getMaxTime() * 2) elif mode == 'HEAVY': - self.specs['max_time'] = float(self.specs['max_time']) * 6 + self.setMaxTime(self.getMaxTime() * 6) def checkRunnable(self, options): """ @@ -335,10 +371,6 @@ def getSlots(self, options): """ return number of slots to use for this tester """ return self.getThreads(options) * self.getProcs(options) - def getCommand(self, options): - """ return the executable command that will be executed by the tester """ - return '' - def hasOpenMPI(self): """ return whether we have openmpi for execution @@ -359,139 +391,46 @@ def hasOpenMPI(self): return False return Path(which_mpiexec).parent.absolute() == Path(which_ompi_info).parent.absolute() - def spawnSubprocessFromOptions(self, timer, options): - """ - Spawns a subprocess based on given options, sets output and error files, - and starts timer. + def getCommand(self, options): """ - cmd = self.getCommand(options) - - use_shell = self.specs["use_shell"] - - if not use_shell: - # Split command into list of args to be passed to Popen - cmd = shlex.split(cmd) - - cwd = self.getTestDir() - - # Verify that the working directory is available right before we execute. - if not os.path.exists(cwd): - # Timers must be used since they are directly indexed in the Job class - timer.start() - self.setStatus(self.fail, 'WORKING DIRECTORY NOT FOUND') - timer.stop() - return 1 - - self.process = None - try: - f = SpooledTemporaryFile(max_size=1000000) # 1M character buffer - e = SpooledTemporaryFile(max_size=100000) # 100K character buffer - - popen_args = [cmd] - popen_kwargs = {'stdout': f, - 'stderr': e, - 'close_fds': False, - 'shell': use_shell, - 'cwd': cwd} - # On Windows, there is an issue with path translation when the command - # is passed in as a list. 
- if platform.system() == "Windows": - popen_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP - else: - popen_kwargs['preexec_fn'] = os.setsid - - # Special logic for openmpi runs - if self.hasOpenMPI(): - popen_env = os.environ.copy() - - # Don't clobber state - popen_env['OMPI_MCA_orte_tmpdir_base'] = self.getTempDirectory().name - # Allow oversubscription for hosts that don't have a hostfile - popen_env['PRTE_MCA_rmaps_default_mapping_policy'] = ':oversubscribe' - - popen_kwargs['env'] = popen_env + Return the command that the Tester wants ran - process = subprocess.Popen(*popen_args, **popen_kwargs) - except: - print("Error in launching a new task", cmd) - raise - - self.process = process - self.outfile = f - self.errfile = e - - timer.start() - return 0 - - def finishAndCleanupSubprocess(self, timer): + We say "wants ran" here because the Runner may inject something + within the command, for example when running within a container. + Due to this distinction, you can obtain the command that was + actually ran via getCommandRan() """ - Waits for the current subproccess to finish, stops the timer, and - cleans up. - """ - self.process.wait() - - timer.stop() - - self.exit_code = self.process.poll() - self.outfile.flush() - self.errfile.flush() - - # store the contents of output, and close the file - self.joined_out = util.readOutput(self.outfile, self.errfile, self) - self.outfile.close() - self.errfile.close() + return None - def runCommand(self, timer, options): + def setCommandRan(self, command): """ - Helper method for running external (sub)processes as part of the tester's execution. This - uses the tester's getCommand and getTestDir methods to run a subprocess. The timer must - be the same timer passed to the run method. Results from running the subprocess is stored - in the tester's output and exit_code fields. - """ - - exit_code = self.spawnSubprocessFromOptions(timer, options) - if exit_code: # Something went wrong - return + Sets the command that was actually ran. - self.finishAndCleanupSubprocess(timer) - - def killCommand(self): + This is needed to account for running commands within containers + and needing to run an additional command up front (i.e., with + a pbs or slurm scheduler calling something like qsub) """ - Kills any currently executing process started by the runCommand method. + self.command_ran = command + + def getCommandRan(self): """ - if self.process is not None: - try: - if platform.system() == "Windows": - from distutils import spawn - if spawn.find_executable("taskkill"): - subprocess.call(['taskkill', '/F', '/T', '/PID', str(self.process.pid)]) - else: - self.process.terminate() - else: - pgid = os.getpgid(self.process.pid) - os.killpg(pgid, SIGTERM) - except OSError: # Process already terminated - pass + Gets the command that was actually ran. - # Try to clean up anything else that we can - self.cleanup() + See setCommandRan() for the distinction. + """ + return self.command_ran - def run(self, timer, options): + def postSpawn(self, runner): """ - This is a method that is the tester's main execution code. Subclasses can override this - method with custom code relevant to their specific testing needs. By default this method - calls runCommand. runCommand is provided as a helper for running (external) subprocesses - as part of the tester's execution and should be the *only* way subprocesses are executed - if needed. The run method is responsible to call the start+stop methods on timer to record - the time taken to run the actual test. 
start+stop can be called multiple times. + Entry point for after the process has been spawned """ - self.runCommand(timer, options) + return def processResultsCommand(self, moose_dir, options): """ method to return the commands (list) used for processing results """ return [] - def processResults(self, moose_dir, options, output): + def processResults(self, moose_dir, options, exit_code, runner_output): """ method to process the results of a finished tester """ return @@ -501,7 +440,9 @@ def hasRedirectedOutput(self, options): def getRedirectedOutputFiles(self, options): """ return a list of redirected output """ - return [os.path.join(self.getTestDir(), self.name() + '.processor.{}'.format(p)) for p in range(self.getProcs(options))] + if self.hasRedirectedOutput(options): + return [os.path.join(self.getTestDir(), self.name() + '.processor.{}'.format(p)) for p in range(self.getProcs(options))] + return [] def addCaveats(self, *kwargs): """ Add caveat(s) which will be displayed with the final test status """ @@ -512,6 +453,10 @@ def addCaveats(self, *kwargs): self.__caveats.add(i) return self.getCaveats() + def removeCaveat(self, caveat): + """ Removes a caveat, which _must_ exist """ + self.__caveats.remove(caveat) + def getCaveats(self): """ Return caveats accumalted by this tester """ return self.__caveats @@ -521,6 +466,18 @@ def clearCaveats(self): self.__caveats = set([]) return self.getCaveats() + def mustOutputExist(self, exit_code): + """ Whether or not we should check for the output once it has ran + + We need this because the PBS/slurm Runner objects, which use + networked file IO, need to wait until the output is available on + on the machine that submitted the jobs. A good example is RunException, + where we should only look for output when we get a nonzero return + code.""" + return exit_code == 0 + + # need something that will tell us if we should try to read the result + def checkRunnableBase(self, options): """ Method to check for caveats that would prevent this tester from @@ -778,12 +735,17 @@ def checkRunnableBase(self, options): self.setStatus(self.fail, 'ABSOLUTE PATH DETECTED') # We can't offer the option of reading output files outside of initial TestDir - if self.specs['working_directory'] and (options.pbs - or options.ok_files - or options.fail_files - or options.sep_files): + if self.specs['working_directory'] and options.sep_files: reasons['working_directory'] = '--sep-files* enabled' + # Explicitly skip HPC tests + if not self.specs['hpc'] and options.hpc: + reasons['hpc'] = 'hpc=false' + + # Use shell not supported for HPC + if self.specs['use_shell'] and options.hpc: + reasons['use_shell'] = 'no use_shell with hpc' + ##### The below must be performed last to register all above caveats ##### # Remove any matching user supplied caveats from accumulated checkRunnable caveats that # would normally produce a skipped test. @@ -821,3 +783,34 @@ def checkRunnableBase(self, options): # Check the return values of the derived classes self._runnable = self.checkRunnable(options) return self._runnable + + def needFullOutput(self, options): + """ + Whether or not the full output is needed. + + If this is True, it means that we cannot truncate + the stderr/stdout output. This is often needed + when we're trying to read something from the output. 
+ """ + return False + + def run(self, options, exit_code, runner_output): + output = self.processResults(self.getMooseDir(), options, exit_code, runner_output) + + # If the tester requested to be skipped at the last minute, report that. + if self.isSkip(): + output += f'\nTester skipped, reason: {self.getStatusMessage()}\n' + elif self.isFail(): + output += f'\nTester failed, reason: {self.getStatusMessage()}\n' + + self.setOutput(output) + + def getHPCPlace(self, options): + """ + Return the placement to use for HPC jobs + """ + if options.hpc_scatter_procs: + procs = self.getProcs(options) + if procs > 1 and procs <= options.hpc_scatter_procs: + return 'scatter' + return 'free' diff --git a/python/TestHarness/testers/XMLDiff.py b/python/TestHarness/testers/XMLDiff.py index f6145293c715..186a0d216e8d 100644 --- a/python/TestHarness/testers/XMLDiff.py +++ b/python/TestHarness/testers/XMLDiff.py @@ -29,9 +29,6 @@ def prepare(self, options): if self.specs['delete_output_before_running'] == True: util.deleteFilesAndFolders(self.getTestDir(), self.specs['xmldiff']) - def processResults(self, moose_dir, options, output): - return SchemaDiff.processResults(self, moose_dir, options, output) - def load_file(self, path1): import xmltodict with open(path1,"r") as f: diff --git a/python/TestHarness/testers/bench.py b/python/TestHarness/testers/bench.py deleted file mode 100644 index 878563eeb417..000000000000 --- a/python/TestHarness/testers/bench.py +++ /dev/null @@ -1,404 +0,0 @@ -#!/usr/bin/env python3 -#* This file is part of the MOOSE framework -#* https://www.mooseframework.org -#* -#* All rights reserved, see COPYRIGHT for full restrictions -#* https://github.com/idaholab/moose/blob/master/COPYRIGHT -#* -#* Licensed under LGPL 2.1, please see LICENSE for details -#* https://www.gnu.org/licenses/lgpl-2.1.html - -import subprocess -import time -import sys -import os -import gc -import shutil -import csv -import tempfile -import threading - -# try to import the resource module. 
We check further down if it failed -try: - import resource -except: - pass - -from TestHarness.testers.Tester import Tester - -def process_timeout(proc, timeout_sec): - kill_proc = lambda p: p.kill() - timer = threading.Timer(timeout_sec, kill_proc, [proc]) - try: - timer.start() - proc.wait() - finally: - timer.cancel() - -class Test: - def __init__(self, executable, infile, rootdir='.', args=None, perflog=False): - self.rootdir = rootdir - self.executable = executable - self.infile = infile - self.args = args - self.dur_secs = 0 - self.perflog = [] - self.getpot_options = ['Outputs/console=false', 'Outputs/exodus=false', 'Outputs/csv=false'] - self.have_perflog = perflog - if self.have_perflog: - self.getpot_options.append('UserObjects/perflog/type=PerflogDumper') - - def _buildcmd(self): - cmdpath = self.executable - infilepath = os.path.abspath(os.path.join(self.rootdir, self.infile)) - cmd = [cmdpath, '-i', infilepath] - if self.args is not None: - cmd.extend(self.args) - cmd.extend(self.getpot_options) - - # check for linux cpu isolation - isolpath = '/sys/devices/system/cpu/isolated' - cpuid = None - if os.path.exists(isolpath): - with open(isolpath, 'r') as f: - cpus = f.read().split(',') - if len(cpus[0].strip()) > 0: - cpuid = cpus[0] - if cpuid: - cmd = ['taskset', '-c', cpuid] + cmd - return cmd - - def reset(self): - self.perflog = [] - self.dur_secs = 0 - - def run(self, timer=None, timeout=300): - self.reset() - cmd = self._buildcmd() - - tmpdir = tempfile.mkdtemp() - shutil.rmtree(tmpdir, ignore_errors=True) - os.makedirs(tmpdir) - - rusage = resource.getrusage(resource.RUSAGE_CHILDREN) - start = rusage.ru_utime - gc.disable() - with open(os.devnull, 'w') as devnull: - if timer: - timer.start() - p = subprocess.Popen(cmd, cwd=tmpdir, stdout=devnull, stderr=devnull) - process_timeout(p, timeout) - if timer: - timer.stop() - gc.enable() - rusage = resource.getrusage(resource.RUSAGE_CHILDREN) - end = rusage.ru_utime - - if p.returncode != 0: - raise RuntimeError('command {} returned nonzero exit code'.format(cmd)) - - self.dur_secs = end - start - - # write perflog - if self.have_perflog: - with open(os.path.join(tmpdir, 'perflog.csv'), 'r') as csvfile: - reader = csv.reader(csvfile) - skip = True # use to skip header line - for row in reader: - if not skip: - self.perflog.append(row) - else: - skip = False - - shutil.rmtree(tmpdir) - -class SpeedTest(Tester): - @staticmethod - def validParams(): - params = Tester.validParams() - params.addParam('input', 'The input file to use for this test.') - params.addParam('test_name', 'The name of the test - populated automatically') - params.addParam('cumulative_dur', 60, 'cumulative time (secs) to run each benchmark') - params.addParam('min_runs', 40, 'minimum number of runs for each benchmark') - params.addParam('max_runs', 400, 'maximum number of runs for each benchmark') - params.addParam('perflog', False, 'true to enable perflog and store its output') - return params - - def __init__(self, name, params): - Tester.__init__(self, name, params) - self.tags.append('speedtests') - self.timeout = max(3600, float(params['max_time'])) - self.check_only = False - - self.params = params - self.benchmark = None - self.db = os.environ.get('MOOSE_SPEED_DB', 'speedtests.sqlite') - - # override - def getMaxTime(self): - return self.timeout - - # override - def checkRunnable(self, options): - # check if resource is available - if 'resource' not in sys.modules: - return False - - # if user is not explicitly running benchmarks, we only run moose 
once and just check - # input - to make sure the benchmark isn't broken. - if 'speedtests' not in options.runtags: - self.params['max_runs'] = 1 - self.params['cli_args'].insert(0, '--check-input') - self.check_only = True - return True - - # override - def run(self, timer, options): - p = self.params - if not self.check_only and options.method not in ['opt', 'oprof', 'dbg']: - raise ValueError('cannot run benchmark with "' + options.method + '" build') - t = Test(p['executable'], p['input'], args=p['cli_args'], rootdir=p['test_dir'], perflog=p['perflog']) - - if self.check_only: - t.run(timer, timeout=p['max_time']) - return - - name = p['test_name'].split('.')[-1] - self.benchmark = Bench(name, test=t, cum_dur=float(p['cumulative_dur']), min_runs=int(p['min_runs']), max_runs=int(p['max_runs'])) - self.benchmark.run(timer, timeout=self.timeout) - with DB(self.db) as db: - db.store(self.benchmark) - - # override - def processResults(self, moose_dir, options, output): - self.setStatus(self.success) - return output - -class Bench: - def __init__(self, name, realruns=None, test=None, cum_dur=60, min_runs=40, max_runs=400): - self.name = name - self.test = test - self.realruns = [] - self.perflogruns = [] - if realruns is not None: - self.realruns.extend(realruns) - self._cum_dur = cum_dur - self._min_runs = min_runs - self._max_runs = max_runs - - def run(self, timer=None, timeout=3600): - tot = 0.0 - start = time.time() - while (len(self.realruns) < self._min_runs or tot < self._cum_dur) and len(self.realruns) < self._max_runs: - dt = time.time() - start - if dt >= timeout: - raise RuntimeError('benchmark timed out after {} with {} runs'.format(dt, len(self.realruns))) - - self.test.run(timer, timeout=timeout - dt) - self.realruns.append(self.test.dur_secs) - self.perflogruns.append(self.test.perflog) - tot += self.test.dur_secs - -class BenchComp: - def __init__(self, oldbench, newbench, psig=0.01): - self.name = oldbench.name - self.psig = psig - self.old = oldbench.realruns - self.new = newbench.realruns - - self.iqr_old = _iqr(self.old) - self.iqr_new = _iqr(self.new) - - from scipy.stats import mannwhitneyu - try: - result = mannwhitneyu(self.iqr_old, self.iqr_new, alternative='two-sided') - self.pvalue = result.pvalue - except: - self.pvalue = 1.0 - - self.u = result[0] - self.avg_old = float(sum(self.iqr_old))/len(self.iqr_old) - self.avg_new = float(sum(self.iqr_new))/len(self.iqr_new) - self.speed_change = (self.avg_new - self.avg_old) / self.avg_old - - @classmethod - def header(cls, revold, revnew): - oldstr, newstr = revold, revnew - if len(oldstr) > 12: - oldstr = oldstr[:12] - if len(newstr) > 12: - newstr = newstr[:12] - revstr = ' {} to {} '.format(oldstr, newstr) - revstr = revstr.center(30,'-') - return '' \ - + '--------------------------------{}--------------------------------'.format(revstr) \ - + '\n{:^30s} {:^15s} {:^15s} {:5s}'.format('benchmark', 'old (sec/run)', 'new (sec/run)', 'speedup (pvalue, nsamples)') \ - + '\n----------------------------------------------------------------------------------------------' - @classmethod - def footer(cls): - return '----------------------------------------------------------------------------------------------' - - def __str__(self): - name = self.name - if len(name) > 30: - name = name[:27] + '...' 
- if self.pvalue <= self.psig: - return '{:>30s}: {:^15f} {:^15f} {:+5.1f}% (p={:.4f},n={}+{})'.format(name, self.avg_old, self.avg_new, self.speed_change*100, self.pvalue, len(self.iqr_old), len(self.iqr_new)) - else: - return '{:>30s}: {:^15f} {:^15f} ~ (p={:.4f},n={}+{})'.format(name, self.avg_old, self.avg_new, self.pvalue, len(self.iqr_old), len(self.iqr_new)) - -def _iqr(a, frac=1000): - """return elements of a within frac*iqr of the the interquartile range (inclusive)""" - import numpy - qup, qlow = numpy.percentile(a, [75 ,25]) - - iqr = qup - qlow - clean = [] - for val in a: - if qlow - frac*iqr <= val and val <= qup + frac*iqr: - clean.append(val) - return clean - -class DB: - def __init__(self, fname): - CREATE_BENCH_TABLE = '''CREATE TABLE IF NOT EXISTS benchmarks - ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT, - executable TEXT, - executable_name TEXT, - executable_method TEXT, - input_file TEXT, - timestamp INTEGER, - revision TEXT, - date INTEGER, - load REAL - );''' - - CREATE_TIMES_TABLE = '''CREATE TABLE IF NOT EXISTS timings - ( - benchmark_id INTEGER, - run INTEGER, - realtime_secs REAL - );''' - - CREATE_PERFLOG_TABLE = '''CREATE TABLE IF NOT EXISTS perflog - ( - benchmark_id INTEGER, - run INTEGER, - field TEXT, - subfield TEXT, - exec_count INTEGER, - self_time_secs REAL, - cum_time_secs REAL - );''' - - self.fname = fname - - # python might not have sqlite3 builtin, so do the import here so - # that the TestHarness can always import this file - import sqlite3 - self.conn = sqlite3.connect(fname) - c = self.conn.cursor() - c.execute(CREATE_BENCH_TABLE) - c.execute(CREATE_TIMES_TABLE) - c.execute(CREATE_PERFLOG_TABLE) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def revisions(self, method='opt'): - c = self.conn.cursor() - c.execute('SELECT revision,date FROM benchmarks WHERE executable_method=? GROUP BY revision ORDER BY date ASC', (method,)) - rows = c.fetchall() - revs = [] - times = [] - for r in rows: - revs.append(r[0]) - times.append(r[1]) - return revs, times - - def bench_names(self, method='opt'): - c = self.conn.cursor() - c.execute('SELECT DISTINCT name FROM benchmarks WHERE executable_method=?', (method,)) - rows = c.fetchall() - names = [] - for r in rows: - names.append(r[0]) - return names - - def list(self, revision, benchmark='', method='opt'): - c = self.conn.cursor() - if benchmark == '': - c.execute('SELECT id,name,executable,input_file FROM benchmarks WHERE INSTR(revision,?) AND executable_method=? ORDER BY date ASC', (revision,method)) - else: - c.execute('SELECT id,name,executable,input_file FROM benchmarks WHERE INSTR(revision,?) AND name=? AND executable_method=? 
ORDER BY date ASC', (revision,benchmark,method)) - benchmarks = c.fetchall() - return benchmarks - - def load_times(self, bench_id): - c = self.conn.cursor() - c.execute('SELECT realtime_secs FROM timings WHERE benchmark_id=?', (bench_id,)) - ents = c.fetchall() - real = [] - for ent in ents: - real.append(float(ent[0])) - return real - - def load(self, revision, bench_name, method='opt'): - """loads and returns a Bench object for the given revision and benchmark name""" - entries = self.list(revision, benchmark=bench_name, method=method) - if len(entries) < 1: - raise RuntimeError('load: no benchamrk for revision="{}",bench_name="{}"'.format(revision, bench_name)) - b = entries[0] - real = self.load_times(b[0]) - return Bench(b[1], test=Test(b[2], b[3]), realruns=real) - - def store(self, benchmark, rev=None): - """stores a (run/executed) Bench in the database. if rev is None, git revision is retrieved from git""" - ex = benchmark.test.executable - (ex_name, ex_method) = os.path.basename(ex).rsplit('-', 1) - infile = benchmark.test.infile - timestamp = time.time() - date = timestamp - if rev is None: - if 'MOOSE_REVISION' in os.environ: - rev = os.environ['MOOSE_REVISION'] - else: - rev, date = git_revision() - load = os.getloadavg()[0] - - c = self.conn.cursor() - c.execute('INSERT INTO benchmarks (name,executable,executable_name,executable_method,input_file,timestamp,revision,date,load) VALUES (?,?,?,?,?,?,?,?,?)', - (benchmark.name, ex, ex_name, ex_method, infile, timestamp, rev, date, load)) - bench_id = c.lastrowid - self.conn.commit() - - i = 0 - for real, perflog in zip(benchmark.realruns, benchmark.perflogruns): - c.execute('INSERT INTO timings (benchmark_id, run, realtime_secs) VALUES (?,?,?)', (bench_id, i, real)) - i += 1 - for entry in perflog: - cat, subcat, nruns, selftime, cumtime = entry - c.execute('INSERT INTO perflog (benchmark_id, run, field, subfield, exec_count, self_time_secs, cum_time_secs) VALUES (?,?,?,?,?,?,?)', - (bench_id, i, cat, subcat, nruns, selftime, cumtime)) - - return bench_id - - def close(self): - self.conn.commit() - self.conn.close() - -def git_revision(dir='.'): - # return hash and (unix secs since epoch) date - cmd = ['git', 'log', '--date', 'raw', '--pretty=format:%H %ad', '-n', '1'] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, cwd=dir) - stdout, stderr = p.communicate() - if p.returncode != 0: - raise RuntimeError('failed to retrieve git revision') - commit = str(stdout).strip().split(' ')[0] - date = int(str(stdout).strip().split(' ')[1]) - return commit, date diff --git a/python/TestHarness/tests/TestHarnessTestCase.py b/python/TestHarness/tests/TestHarnessTestCase.py index dbbb0e2c48e0..456a60264be1 100644 --- a/python/TestHarness/tests/TestHarnessTestCase.py +++ b/python/TestHarness/tests/TestHarnessTestCase.py @@ -10,6 +10,7 @@ import os import unittest import subprocess +import tempfile import re class TestHarnessTestCase(unittest.TestCase): @@ -17,19 +18,22 @@ class TestHarnessTestCase(unittest.TestCase): TestCase class for running TestHarness commands. 
""" - def runExceptionTests(self, *args): - os.environ['MOOSE_TERM_FORMAT'] = 'njCst' - cmd = ['./run_tests'] + list(args) + def runTests(self, *args, tmp_output=True): + cmd = ['./run_tests'] + list(args) + ['--term-format', 'njCst'] + sp_kwargs = {'cwd': os.path.join(os.getenv('MOOSE_DIR'), 'test'), + 'text': True} + if tmp_output: + with tempfile.TemporaryDirectory() as output_dir: + cmd += ['-o', output_dir] + return subprocess.check_output(cmd, **sp_kwargs) + return subprocess.check_output(cmd, **sp_kwargs) + + def runExceptionTests(self, *args, tmp_output=True): try: - return subprocess.check_output(cmd, cwd=os.path.join(os.getenv('MOOSE_DIR'), 'test')) - raise RuntimeError('test failed to fail') + self.runTests(*args, tmp_output=tmp_output) except Exception as err: return err.output - - def runTests(self, *args): - os.environ['MOOSE_TERM_FORMAT'] = 'njCst' - cmd = ['./run_tests'] + list(args) - return subprocess.check_output(cmd, cwd=os.path.join(os.getenv('MOOSE_DIR'), 'test')) + raise RuntimeError('test failed to fail') def checkStatus(self, output, passed=0, skipped=0, pending=0, failed=0): """ diff --git a/python/TestHarness/tests/test_Allocations.py b/python/TestHarness/tests/test_Allocations.py index 8cfd19d2a664..6b950b3e3113 100644 --- a/python/TestHarness/tests/test_Allocations.py +++ b/python/TestHarness/tests/test_Allocations.py @@ -17,15 +17,15 @@ def testSkippedAllocations(self): """ # Subject a normally passing test to impossible cpu allocations output = self.runTests('--no-color', '-i', 'always_ok', '-p', '2', '-j', '1') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') + self.assertRegex(output, 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') # Subject a normally passing test to impossible thread allocations output = self.runTests('--no-color', '-i', 'always_ok', '--n-threads', '2', '-j', '1') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') + self.assertRegex(output, 'tests/test_harness.always_ok.*? \[INSUFFICIENT SLOTS\] SKIP') # A combination of threads*cpus with too low a hard limit (3*3= -j9) output = self.runTests('--no-color', '-i', 'allocation_test', '--n-threads', '3', '-p', '3', '-j', '8') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.allocation_test.*? \[INSUFFICIENT SLOTS\] SKIP') + self.assertRegex(output, 'tests/test_harness.allocation_test.*? \[INSUFFICIENT SLOTS\] SKIP') def testOversizedCaveat(self): """ @@ -33,13 +33,13 @@ def testOversizedCaveat(self): """ # A test which has no min/max cpu parameters should print oversized # when subjected to -p 2 - output = self.runTests('-i', 'always_ok', '-p', '2').decode('utf-8') + output = self.runTests('-i', 'always_ok', '-p', '2') self.assertNotIn('CPUS', output) self.assertIn('OVERSIZED', output) # A test which has no min/max thread parameters should print oversized # when subjected to --n-threads 2 - output = self.runTests('-i', 'always_ok', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'always_ok', '--n-threads', '2') self.assertNotIn('THREADS', output) self.assertIn('OVERSIZED', output) @@ -51,14 +51,14 @@ def testCpuCaveats(self): """ # Test MIN CPUs / Oversized caveat using soft limit (no -j) on a test # having a minimum cpu parameter of 2. 
- output = self.runTests('-i', 'allocation_test', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '--n-threads', '2') self.assertNotIn('MIN_THREADS', output) self.assertIn('MIN_CPUS=2', output) self.assertIn('OVERSIZED', output) # Test MAX CPUs / Oversized caveat on a test having a maximum cpu # parameter of 3 (and we subjected it to 4). - output = self.runTests('-i', 'allocation_test', '-p', '4', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-p', '4', '--n-threads', '2') self.assertNotIn('MIN_THREADS', output) self.assertIn('MAX_CPUS=3', output) self.assertIn('OVERSIZED', output) @@ -73,7 +73,7 @@ def testThreadCaveats(self): # Note: 1*2 should be -j 2 but the test minimum is 2 threads, so we need # to use -j 4 to suppress any cpu caveats. Oversized will not trigger as # -j4 satisfies this test's requirements. - output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '1').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '1') self.assertNotIn('CPUS', output) self.assertNotIn('OVERSIZED', output) self.assertIn('MIN_THREADS=2', output) @@ -83,7 +83,7 @@ def testThreadCaveats(self): # are specifically testing that setting a lower j does _not_ trigger an # insufficient skipped test scenario. Oversized will not trigger as # -j6 satisfies this test's requirements. - output = self.runTests('-i', 'allocation_test', '-j', '6', '-p', '2', '--n-threads', '4').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-j', '6', '-p', '2', '--n-threads', '4') self.assertNotIn('CPUS', output) self.assertNotIn('OVERSIZED', output) self.assertIn('MAX_THREADS=3', output) @@ -94,7 +94,7 @@ def testPerfectAllocation(self): """ # Passing test triggering no caveats, as supplied allocations satisfies # the test's requirements - output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '2').decode('utf-8') + output = self.runTests('-i', 'allocation_test', '-j', '4', '-p', '2', '--n-threads', '2') self.assertNotIn('MIN_THREADS', output) self.assertNotIn('MAX_THREADS', output) self.assertNotIn('MIN_CPUS', output) diff --git a/python/TestHarness/tests/test_ArbitrarySpecFile.py b/python/TestHarness/tests/test_ArbitrarySpecFile.py index 7003fa6308d5..551d35481f52 100644 --- a/python/TestHarness/tests/test_ArbitrarySpecFile.py +++ b/python/TestHarness/tests/test_ArbitrarySpecFile.py @@ -15,16 +15,16 @@ def testArbitrarySpecFile(self): Verify an arbitrary test will run when we use the --spec-file argument """ # Test that we do not recursively find additional tests - output = self.runTests('--spec-file', 'tests/test_harness/arbitrary_test').decode('utf-8') + output = self.runTests('--spec-file', 'tests/test_harness/arbitrary_test') self.assertIn('tests/test_harness.always_ok', output) self.assertNotIn('tests/test_harness/arbitrary_directory.always_ok', output) # Test that we do find additional tests with recursion - output = self.runTests('--spec-file', 'tests/test_harness', '-i', 'arbitrary_test').decode('utf-8') + output = self.runTests('--spec-file', 'tests/test_harness', '-i', 'arbitrary_test') self.assertIn('tests/test_harness.always_ok', output) self.assertIn('tests/test_harness/arbitrary_directory.always_ok', output) # Test that we are not recursively finding our way backwards - output = self.runTests('--spec-file', 'tests/test_harness/arbitrary_directory', '-i', 'arbitrary_test').decode('utf-8') + output = 
self.runTests('--spec-file', 'tests/test_harness/arbitrary_directory', '-i', 'arbitrary_test') self.assertIn('tests/test_harness/arbitrary_directory.always_ok', output) self.assertNotIn('tests/test_harness.always_ok', output) diff --git a/python/TestHarness/tests/test_CSVDiffs.py b/python/TestHarness/tests/test_CSVDiffs.py index 5da059689d63..b1ca2df33bb2 100644 --- a/python/TestHarness/tests/test_CSVDiffs.py +++ b/python/TestHarness/tests/test_CSVDiffs.py @@ -19,9 +19,9 @@ def testDiffs(self): self.runTests('-i', 'csvdiffs') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.test_csvdiff.*?FAILED \(Override inputs not the same length\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.test_badfile.*?FAILED \(MISSING GOLD FILE\)') - self.checkStatus(e.output.decode('utf-8'), failed=2) + self.assertRegex(e.output, r'test_harness\.test_csvdiff.*?FAILED \(Override inputs not the same length\)') + self.assertRegex(e.output, r'test_harness\.test_badfile.*?FAILED \(MISSING GOLD FILE\)') + self.checkStatus(e.output, failed=2) def testMissingComparison(self): """ @@ -31,8 +31,8 @@ def testMissingComparison(self): self.runTests('-i', 'csvdiff_missing_comparison_file') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.test_csvdiff_comparison_file_missing.*?FAILED \(MISSING COMPARISON FILE\)') - self.checkStatus(e.output.decode('utf-8'), failed=1) + self.assertRegex(e.output, r'test_harness\.test_csvdiff_comparison_file_missing.*?FAILED \(MISSING COMPARISON FILE\)') + self.checkStatus(e.output, failed=1) def testCSVDiffScript(self): """ diff --git a/python/TestHarness/tests/test_CSVValidationTester.py b/python/TestHarness/tests/test_CSVValidationTester.py index ae009009eed8..004a3afe2cde 100644 --- a/python/TestHarness/tests/test_CSVValidationTester.py +++ b/python/TestHarness/tests/test_CSVValidationTester.py @@ -20,10 +20,10 @@ def testCSVValidationTester(self): """ with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'csv_validation_tester', '--no-color').decode('utf-8') + self.runTests('-i', 'csv_validation_tester', '--no-color') e = cm.exception - output = e.output.decode('utf-8') + output = e.output self.assertRegexpMatches(output, r'test_harness\.csv_validation_tester_01.*?OK') self.assertRegexpMatches(output, r'test_harness\.csv_validation_tester_02.*?FAILED \(DIFF\)') @@ -34,9 +34,9 @@ def testCSVValidationTesterVerbose(self): """ with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'csv_validation_tester', '--verbose', '--no-color').decode('utf-8') + self.runTests('-i', 'csv_validation_tester', '--verbose', '--no-color') e = cm.exception - output = e.output.decode('utf-8') + output = e.output self.assertRegexpMatches(output, 'csv_validation_tester_01.csv | 0.00 \xb1 0.01 | 0.01 \xb1 0.01') self.assertRegexpMatches(output, 'csv_validation_tester_02.csv | 0.00 \xb1 0.01 | 0.01 \xb1 0.00') diff --git a/python/TestHarness/tests/test_CustomEval.py b/python/TestHarness/tests/test_CustomEval.py index ffb411b8cbd0..8aabcf29e867 100644 --- a/python/TestHarness/tests/test_CustomEval.py +++ b/python/TestHarness/tests/test_CustomEval.py @@ -6,10 +6,10 @@ def testCustomEval(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'custom_eval') e = cm.exception - self.assertIn('Custom evaluation failed', e.output.decode('utf-8')) + self.assertIn('Custom evaluation failed', e.output) #test expect out failure with 
self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'custom_eval') e = cm.exception - self.assertIn('expect_out and absent_out can not be supplied', e.output.decode('utf-8')) + self.assertIn('expect_out and absent_out can not be supplied', e.output) diff --git a/python/TestHarness/tests/test_Cyclic.py b/python/TestHarness/tests/test_Cyclic.py index 0a4270d976f1..9134a9b133f2 100644 --- a/python/TestHarness/tests/test_Cyclic.py +++ b/python/TestHarness/tests/test_Cyclic.py @@ -19,5 +19,5 @@ def testCyclic(self): self.runTests('--no-color', '-i', 'cyclic_tests') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.testC.*? FAILED \(Cyclic or Invalid Dependency Detected!\)') - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.test[A|B].*? \[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(e.output, r'tests/test_harness.testC.*? FAILED \(Cyclic or Invalid Dependency Detected!\)') + self.assertRegex(e.output, r'tests/test_harness.test[A|B].*? \[SKIPPED DEPENDENCY\] SKIP') diff --git a/python/TestHarness/tests/test_Deleted.py b/python/TestHarness/tests/test_Deleted.py index 1012d58f8676..01e09c927ba1 100644 --- a/python/TestHarness/tests/test_Deleted.py +++ b/python/TestHarness/tests/test_Deleted.py @@ -19,7 +19,7 @@ def testDeleted(self): self.runTests('--no-color', '-i', 'deleted', '-e') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.deleted.*? \[TEST DELETED TEST\] FAILED \(DELETED\)') + self.assertRegex(e.output, r'test_harness\.deleted.*? \[TEST DELETED TEST\] FAILED \(DELETED\)') # Verify return code is DELETED related (0x83) self.assertIs(0x83, e.returncode) @@ -28,5 +28,5 @@ def testNoExtraInfo(self): """ Test that deleted tests do not run without -e (extra) option """ - output = self.runTests('--no-color', '-i', 'deleted').decode('utf-8') + output = self.runTests('--no-color', '-i', 'deleted') self.assertNotIn('tests/test_harness.deleted', output) diff --git a/python/TestHarness/tests/test_DependencySkip.py b/python/TestHarness/tests/test_DependencySkip.py index c1cc7ec0c094..b7b33f1ee3f6 100644 --- a/python/TestHarness/tests/test_DependencySkip.py +++ b/python/TestHarness/tests/test_DependencySkip.py @@ -13,6 +13,6 @@ def testDependencySkip(self): """ Test skipping a test if its prereq is also skipped """ - output = self.runTests('--no-color', '-i', 'depend_skip_tests').decode('utf-8') + output = self.runTests('--no-color', '-i', 'depend_skip_tests') self.assertIn('[ALWAYS SKIPPED] SKIP', output) self.assertIn('[SKIPPED DEPENDENCY] SKIP', output) diff --git a/python/TestHarness/tests/test_Diff.py b/python/TestHarness/tests/test_Diff.py index 602878054242..41a3e54081cb 100644 --- a/python/TestHarness/tests/test_Diff.py +++ b/python/TestHarness/tests/test_Diff.py @@ -19,10 +19,10 @@ def testDiffs(self): self.runTests('-i', 'diffs') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?Running exodiff') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?Running csvdiff') + self.assertRegex(e.output, r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') + self.assertRegex(e.output, r'test_harness\.exodiff.*?Running exodiff') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?Running 
csvdiff') # Verify return code is DIFF related (0x81) self.assertIs(0x81, e.returncode) diff --git a/python/TestHarness/tests/test_DiffGold.py b/python/TestHarness/tests/test_DiffGold.py index 0f7fe02fcc92..7105ee725b6c 100644 --- a/python/TestHarness/tests/test_DiffGold.py +++ b/python/TestHarness/tests/test_DiffGold.py @@ -19,7 +19,7 @@ def testDiffs(self): self.runTests('-i', 'diff_golds') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.exodiff.*?Running exodiff') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.csvdiff.*?Running csvdiff') + self.assertRegex(e.output, r'test_harness\.exodiff.*?FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?FAILED \(CSVDIFF\)') + self.assertRegex(e.output, r'test_harness\.exodiff.*?Running exodiff') + self.assertRegex(e.output, r'test_harness\.csvdiff.*?Running csvdiff') diff --git a/python/TestHarness/tests/test_DisplayRequired.py b/python/TestHarness/tests/test_DisplayRequired.py index d032cfc43531..bdfde22c884d 100644 --- a/python/TestHarness/tests/test_DisplayRequired.py +++ b/python/TestHarness/tests/test_DisplayRequired.py @@ -21,7 +21,7 @@ def testDislpayRequired(self): os.unsetenv('DISPLAY') output = self.runTests('--no-color', '-i', 'display_required') - self.assertRegex(output.decode('utf-8'), r'test_harness\.display_required.*? \[NO DISPLAY\] SKIP') + self.assertRegex(output, r'test_harness\.display_required.*? \[NO DISPLAY\] SKIP') if display: os.putenv('DISPLAY', display) diff --git a/python/TestHarness/tests/test_DistributedMesh.py b/python/TestHarness/tests/test_DistributedMesh.py index c3346027dcea..d52399c740cb 100644 --- a/python/TestHarness/tests/test_DistributedMesh.py +++ b/python/TestHarness/tests/test_DistributedMesh.py @@ -16,11 +16,11 @@ def testSyntax(self): """ # Verify the distributed mesh test is skipped - output = self.runExceptionTests('-i', 'mesh_mode_distributed', '--no-color').decode('utf-8') + output = self.runTests('-i', 'mesh_mode_distributed', '--no-color') self.assertIn('[MESH_MODE!=DISTRIBUTED] SKIP', output) # Verify the distributed mesh test is passing when providing --distributed # To be acurate, test for OK rather than asserting if 'distributed' is # missing from the output. 
output = self.runTests('--distributed', '-i', 'mesh_mode_distributed') - self.assertRegex(output.decode('utf-8'), 'test_harness.distributed_mesh.*?OK') + self.assertRegex(output, 'test_harness.distributed_mesh.*?OK') diff --git a/python/TestHarness/tests/test_DoLast.py b/python/TestHarness/tests/test_DoLast.py index 54877a2e70a1..4a5b04a96693 100644 --- a/python/TestHarness/tests/test_DoLast.py +++ b/python/TestHarness/tests/test_DoLast.py @@ -20,7 +20,7 @@ def testDoLastDuplicate(self): e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') + self.assertRegex(e.output, r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') def testDoLastDepends(self): """ @@ -31,21 +31,21 @@ def testDoLastDepends(self): e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') + self.assertRegex(e.output, r'tests/test_harness.*?FAILED \(Cyclic or Invalid Dependency Detected!\)') def testDoLast(self): """ Confirm 'do_last' tested last """ output = self.runTests('--no-color', '-i', 'do_last') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.a.*?OK\ntests/test_harness.do_last.*?OK') + self.assertRegex(output, 'tests/test_harness.a.*?OK\ntests/test_harness.do_last.*?OK') def testDoLastSkipped(self): """ Confirm 'do_last' is skipped if a test it depends on failed/skipped. """ output = self.runTests('--no-color', '-i', 'do_last_skipped') - self.assertRegex(output.decode('utf-8'), 'test_harness.do_last.*?\[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(output, 'test_harness.do_last.*?\[SKIPPED DEPENDENCY\] SKIP') def testDoLastName(self): """ @@ -56,4 +56,4 @@ def testDoLastName(self): e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness.*?FAILED \(Test named ALL when "prereq = ALL" elsewhere in test spec file!\)') + self.assertRegex(e.output, 'test_harness.*?FAILED \(Test named ALL when "prereq = ALL" elsewhere in test spec file!\)') diff --git a/python/TestHarness/tests/test_DryRun.py b/python/TestHarness/tests/test_DryRun.py index 87044d165ec4..99ca3cffcf9c 100644 --- a/python/TestHarness/tests/test_DryRun.py +++ b/python/TestHarness/tests/test_DryRun.py @@ -17,13 +17,13 @@ def testDryRun(self): """ output = self.runTests('-i', 'diffs', '--dry-run') - self.assertRegex(output.decode('utf-8'), 'test_harness\.exodiff.*?DRY RUN') - self.assertRegex(output.decode('utf-8'), 'test_harness\.csvdiff.*?DRY RUN') + self.assertRegex(output, 'test_harness\.exodiff.*?DRY RUN') + self.assertRegex(output, 'test_harness\.csvdiff.*?DRY RUN') # Skipped caveat test which returns skipped instead of 'DRY RUN' output = self.runTests('--no-color', '-i', 'depend_skip_tests', '--dry-run') - self.assertRegex(output.decode('utf-8'), r'tests/test_harness.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), r'tests/test_harness.needs_always_skipped.*? \[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(output, r'tests/test_harness.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, r'tests/test_harness.needs_always_skipped.*? \[SKIPPED DEPENDENCY\] SKIP') # Deleted caveat test which returns a deleted failing tests while # performing a dry run @@ -31,4 +31,4 @@ def testDryRun(self): self.runTests('--no-color', '-i', 'deleted', '-e', '--dry-run') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.deleted.*? 
\[TEST DELETED TEST\] FAILED \(DELETED\)') + self.assertRegex(e.output, r'test_harness\.deleted.*? \[TEST DELETED TEST\] FAILED \(DELETED\)') diff --git a/python/TestHarness/tests/test_Duplicate.py b/python/TestHarness/tests/test_Duplicate.py index 3ed824e69ecf..07835ec2f8e4 100644 --- a/python/TestHarness/tests/test_Duplicate.py +++ b/python/TestHarness/tests/test_Duplicate.py @@ -18,18 +18,17 @@ def testDuplicateOutputs(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'duplicate_outputs') - e = cm.exception + output = cm.exception.output + self.assertIn('Tests: d, c', output) + self.assertIn('File(s): good_out.e', output) - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(OUTFILE RACE CONDITION\)') - - # Use a different spec file, which makes use of the AnalyzeJacobian tester. The is because - # a race condition, when caught, will invalidate the rest of the tests with out testing them. + # Use a different spec file, which makes use of the AnalyzeJacobian tester with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'duplicate_outputs_analyzejacobian') - e = cm.exception - - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(OUTFILE RACE CONDITION\)') + output = cm.exception.output + self.assertIn('Tests: b, a', output) + self.assertIn('File(s): good.i', output) def testDuplicateOutputsOK(self): """ @@ -40,33 +39,8 @@ def testDuplicateOutputsOK(self): output += self.runTests('-i', 'duplicate_outputs_ok', '--heavy') # skip case - self.assertNotRegexpMatches(output.decode('utf-8'), 'skipped_out.e') + self.assertNotRegexpMatches(output, 'skipped_out.e') # heavy case - self.assertNotRegexpMatches(output.decode('utf-8'), 'heavy_out.e') + self.assertNotRegexpMatches(output, 'heavy_out.e') # all - self.assertNotRegexpMatches(output.decode('utf-8'), 'FATAL TEST HARNESS ERROR') - - def testDelayedDuplicateOutputs(self): - """ - Test a more complex, delayed, race condition by running three tests. Two which launch - immediately, and a third, waiting on one job to finish. When it does, this third test - will write to the same output file, that one of the other tests which is still running - is writing to. Thus, causing a delayed race condition. 
- """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'duplicate_outputs_prereqs') - - e = cm.exception - - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.*?FAILED \(OUTFILE RACE CONDITION\)') - - def testMultipleDuplicateOutputs(self): - """ - Test for multiple duplicate outputs created by one test - """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'multiple_duplicate_outputs') - - e = cm.exception - - self.assertRegex(e.output.decode('utf-8'), r'FAILED \(DUPLICATE OUTFILES\)') + self.assertNotRegexpMatches(output, 'FATAL TEST HARNESS ERROR') diff --git a/python/TestHarness/tests/test_Expect.py b/python/TestHarness/tests/test_Expect.py index da66d6b36218..447015a40f3f 100644 --- a/python/TestHarness/tests/test_Expect.py +++ b/python/TestHarness/tests/test_Expect.py @@ -19,13 +19,13 @@ def testExpect(self): self.runTests('-i', 'expect') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_err_pattern.*?FAILED \(EXPECTED ERROR MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_out_pattern.*?FAILED \(EXPECTED OUTPUT MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.absent_out_pattern.*?FAILED \(OUTPUT NOT ABSENT\)') + self.assertRegex(e.output, r'test_harness\.no_expect_err_pattern.*?FAILED \(EXPECTED ERROR MISSING\)') + self.assertRegex(e.output, r'test_harness\.no_expect_out_pattern.*?FAILED \(EXPECTED OUTPUT MISSING\)') + self.assertRegex(e.output, r'test_harness\.absent_out_pattern.*?FAILED \(OUTPUT NOT ABSENT\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_err_literal.*?FAILED \(EXPECTED ERROR MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.no_expect_out_literal.*?FAILED \(EXPECTED OUTPUT MISSING\)') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.absent_out_literal.*?FAILED \(OUTPUT NOT ABSENT\)') + self.assertRegex(e.output, r'test_harness\.no_expect_err_literal.*?FAILED \(EXPECTED ERROR MISSING\)') + self.assertRegex(e.output, r'test_harness\.no_expect_out_literal.*?FAILED \(EXPECTED OUTPUT MISSING\)') + self.assertRegex(e.output, r'test_harness\.absent_out_literal.*?FAILED \(OUTPUT NOT ABSENT\)') def testExpectMissing(self): """ @@ -35,4 +35,4 @@ def testExpectMissing(self): self.runTests('-i', 'expect_missing_params') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'Either "expect_err" or "expect_assert" must be supplied') + self.assertRegex(e.output, r'Either "expect_err" or "expect_assert" must be supplied') diff --git a/python/TestHarness/tests/test_ExtraInfo.py b/python/TestHarness/tests/test_ExtraInfo.py index 291b00c40b25..12f246ab4168 100644 --- a/python/TestHarness/tests/test_ExtraInfo.py +++ b/python/TestHarness/tests/test_ExtraInfo.py @@ -39,7 +39,7 @@ def testExtraInfo(self): # will use the --ignore feature to force the test to run # regardless if that check(s) would otherwise cause this # test to be skipped. 
- output = self.runTests('-c', '-i', 'extra_info', '--ignore', '-e').decode('utf-8') + output = self.runTests('-c', '-i', 'extra_info', '--ignore', '-e') # Parse the output, and find the caveat string raw_caveat_string = re.findall(r'\[(.*)\]', output) diff --git a/python/TestHarness/tests/test_FailedTests.py b/python/TestHarness/tests/test_FailedTests.py index 096610102228..44bdda057c67 100644 --- a/python/TestHarness/tests/test_FailedTests.py +++ b/python/TestHarness/tests/test_FailedTests.py @@ -8,6 +8,7 @@ #* https://www.gnu.org/licenses/lgpl-2.1.html import subprocess +import tempfile from TestHarnessTestCase import TestHarnessTestCase class TestHarnessTester(TestHarnessTestCase): @@ -17,21 +18,24 @@ def testFailedTests(self): to create a json file containing previous results, and again to only run the test which that has failed. """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--no-color', '-i', 'always_bad', '--results-file', 'failed-unittest') + with tempfile.TemporaryDirectory() as output_dir: + args = ['--no-color', '--results-file', 'failed-unittest', '-o', output_dir] + kwargs = {'tmp_output': False} + with self.assertRaises(subprocess.CalledProcessError) as cm: + self.runTests(*args, '-i', 'always_bad', **kwargs) - e = cm.exception + e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.always_ok.*?OK') - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.always_bad.*?FAILED \(CODE 1\)') + self.assertRegex(e.output, r'tests/test_harness.always_ok.*?OK') + self.assertRegex(e.output, r'tests/test_harness.always_bad.*?FAILED \(CODE 1\)') - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--no-color', '--failed-tests', '--results-file', 'failed-unittest') + with self.assertRaises(subprocess.CalledProcessError) as cm: + self.runTests(*args, '--failed-tests', **kwargs) - e = cm.exception + e = cm.exception - # Verify the passing test is not present - self.assertNotRegex(e.output.decode('utf-8'), r'tests/test_harness.always_ok.*?OK') + # Verify the passing test is not present + self.assertNotRegex(e.output, r'tests/test_harness.always_ok.*?OK') - # Verify the caveat represents a previous result - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.always_bad.*?\[PREVIOUS RESULTS: CODE 1\] FAILED \(CODE 1\)') + # Verify the caveat represents a previous result + self.assertRegex(e.output, r'tests/test_harness.always_bad.*?\[PREVIOUS RESULTS: CODE 1\] FAILED \(CODE 1\)') diff --git a/python/TestHarness/tests/test_Ignore.py b/python/TestHarness/tests/test_Ignore.py index 3f2c056f872f..1ff2c27bef64 100644 --- a/python/TestHarness/tests/test_Ignore.py +++ b/python/TestHarness/tests/test_Ignore.py @@ -16,7 +16,7 @@ def testIgnoreSkip(self): """ # Run a skipped test output = self.runTests('-i', 'ignore_skipped', '--ignore', 'skip') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_skipped.*?OK') def testIgnoreHeavy(self): """ @@ -24,7 +24,7 @@ def testIgnoreHeavy(self): """ # Run a skipped heavy test output = self.runTests('-i', 'ignore_heavy', '--ignore', 'heavy') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_heavy.*?OK') + self.assertRegex(output, 'test_harness\.ignore_heavy.*?OK') def testIgnoreCompiler(self): """ @@ -33,7 +33,7 @@ def testIgnoreCompiler(self): """ # Run a skipped compiler test output = self.runTests('-i', 'ignore_compiler', '--ignore', 'compiler') - 
self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_compiler.*?OK') + self.assertRegex(output, 'test_harness\.ignore_compiler.*?OK') def testIgnorePlatform(self): """ @@ -42,7 +42,7 @@ def testIgnorePlatform(self): """ # Run a skipped platform test output = self.runTests('-i', 'ignore_platform', '--ignore', 'platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_platform.*?OK') + self.assertRegex(output, 'test_harness\.ignore_platform.*?OK') def testIgnorePreReq(self): """ @@ -51,13 +51,13 @@ def testIgnorePreReq(self): """ # Run a skipped prereq test output = self.runTests('--no-color', '-i', 'ignore_prereq', '--ignore', 'prereq') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_skipped_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_skipped_dependency.*?OK') # Check that a dependency test runs when its prereq test is skipped output = self.runTests('--no-color', '-i', 'ignore_prereq', '--ignore', 'skip') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_skipped_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_skipped_dependency.*?OK') def testIgnoreMultiple(self): """ @@ -66,7 +66,7 @@ def testIgnoreMultiple(self): """ # Run a multiple caveat skipped test by manually supplying each caveat output = self.runTests('-i', 'ignore_multiple', '--ignore', 'skip heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multiple.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multiple.*?OK') def testIgnoreAll(self): """ @@ -75,7 +75,7 @@ def testIgnoreAll(self): """ # Run a multiple caveat skipped test using built in default 'all' output = self.runTests('-i', 'ignore_multiple', '--ignore') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multiple.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multiple.*?OK') def testIgnoreMissingOne(self): """ @@ -84,7 +84,7 @@ def testIgnoreMissingOne(self): """ # Skip a multiple caveat test by not supplying enough caveats to ignore output = self.runTests('--no-color', '-i', 'ignore_multiple', '--ignore', 'skip heavy compiler') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multiple.*? \[PLATFORM!=NON_EXISTENT\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_multiple.*? 
\[PLATFORM!=NON_EXISTENT\] SKIP') def testIgnoreMultiplePreReq(self): """ @@ -93,27 +93,27 @@ def testIgnoreMultiplePreReq(self): """ # Run a multiple caveat prereq test using built in default 'all' output = self.runTests('-i', 'ignore_multiple_prereq', '--ignore') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*?OK') # Run a multiple caveat prereq test by manually supplying each caveat output = self.runTests('-i', 'ignore_multiple_prereq', '--ignore', 'prereq skip heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*?OK') # Skip a multiple caveat prereq test by not supplying enough caveats to ignore output = self.runTests('--no-color', '-i', 'ignore_multiple_prereq', '--ignore', 'prereq skip heavy compiler') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*?OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*? \[PLATFORM!=NON_EXISTENT\] SKIP') + self.assertRegex(output, 'test_harness\.always_skipped.*?OK') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*? \[PLATFORM!=NON_EXISTENT\] SKIP') # Check that a multiple caveat dependency test runs when its prereq test is skipped # This test may seem redundant, but `prereq` is handled differently than the other caveats output = self.runTests('--no-color', '-i', 'ignore_multiple_prereq', '--ignore', 'prereq heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*?OK') + self.assertRegex(output, 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*?OK') # Check that by supplying a very specific set of ignored paramaters, we # can properly trigger a skipped dependency scenario output = self.runTests('--no-color', '-i', 'ignore_multiple_prereq', '--ignore', 'heavy compiler platform') - self.assertRegex(output.decode('utf-8'), 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.ignore_multi_prereq_dependency.*? \[SKIPPED DEPENDENCY\] SKIP') + self.assertRegex(output, 'test_harness\.always_skipped.*? \[ALWAYS SKIPPED\] SKIP') + self.assertRegex(output, 'test_harness\.ignore_multi_prereq_dependency.*? 
\[SKIPPED DEPENDENCY\] SKIP') diff --git a/python/TestHarness/tests/test_InstallType.py b/python/TestHarness/tests/test_InstallType.py index 16796b0388d4..1a2ae7e047cf 100644 --- a/python/TestHarness/tests/test_InstallType.py +++ b/python/TestHarness/tests/test_InstallType.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, io +import os, io import unittest import mock import TestHarness @@ -21,7 +21,8 @@ def mocked_output(self, mocked, expect_fail, mocked_return): out = io.StringIO() with redirect_stdout(out): mocked_return.return_value=mocked - harness = TestHarness.TestHarness(['', '-i', 'install_type', '-c'], MOOSE_DIR) + cmd = ['', '-i', 'install_type', '-c', '--term-format', 'njCst'] + harness = TestHarness.TestHarness(cmd, MOOSE_DIR) if expect_fail: with self.assertRaises(SystemExit): harness.findAndRunTests() @@ -34,15 +35,15 @@ def testInstalled(self): Test which only runs if binary is installed """ out = self.mocked_output(set(['ALL', 'INSTALLED']), False) - self.assertRegex(out, r'.*?SKIP.*?in_tree_type.*?"IN_TREE" binary]') - self.assertRegex(out, r'.*?OK.*?installed_type') - self.assertRegex(out, r'.*?OK.*?all_type') + self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+\[TEST REQUIRES "IN_TREE" BINARY\]\s+SKIP') + self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+OK') + self.assertRegex(out, r'tests\/test_harness\.all_type[\s.]+OK') def testInTree(self): """ Test which only runs if binary is in_tree """ out = self.mocked_output(set(['ALL', 'IN_TREE']), False) - self.assertRegex(out, r'.*?SKIP.*?installed_type.*?"INSTALLED" binary]') - self.assertRegex(out, r'.*?OK.*?in_tree_type') - self.assertRegex(out, r'.*?OK.*?all_type') + self.assertRegex(out, r'tests\/test_harness\.in_tree_type[\s.]+OK') + self.assertRegex(out, r'tests\/test_harness\.installed_type[\s.]+\[TEST REQUIRES "INSTALLED" BINARY\]\s+SKIP') + self.assertRegex(out, r'tests\/test_harness\.all_type[\s.]+OK') diff --git a/python/TestHarness/tests/test_LongRunning.py b/python/TestHarness/tests/test_LongRunning.py index 8a00eabc17d6..69a74ea12a81 100644 --- a/python/TestHarness/tests/test_LongRunning.py +++ b/python/TestHarness/tests/test_LongRunning.py @@ -14,6 +14,6 @@ def testLongRunningStatus(self): """ Test for RUNNING status in the TestHarness """ - output = self.runTests('-i', 'long_running').decode('utf-8') + output = self.runTests('-i', 'long_running') self.assertIn('RUNNING', output) self.assertIn('[FINISHED]', output) diff --git a/python/TestHarness/tests/test_LongestJobs.py b/python/TestHarness/tests/test_LongestJobs.py index d324dcd8fd23..fef405086faa 100644 --- a/python/TestHarness/tests/test_LongestJobs.py +++ b/python/TestHarness/tests/test_LongestJobs.py @@ -18,7 +18,7 @@ def testLongestJobs(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'longest_jobs', '--longest-jobs', '4') - output = cm.exception.output.decode('utf-8') + output = cm.exception.output self.assertIn('4 longest running jobs', output) self.assertRegex(output, r'(?s)longest running jobs.*run_1') @@ -30,7 +30,7 @@ def testLongestJobsNoneCompleted(self): """ Test for --longest-jobs in the TestHarness with no jobs ran. 
""" - output = self.runTests('-i', 'longest_jobs', '--re', 'foo', '--longest-jobs', '100').decode('utf-8') + output = self.runTests('-i', 'longest_jobs', '--re', 'foo', '--longest-jobs', '100') self.assertIn('100 longest running jobs', output) self.assertNotRegex(output, r'(?s)longest running jobs.*') diff --git a/python/TestHarness/tests/test_MachineType.py b/python/TestHarness/tests/test_MachineType.py index ac4026f4db35..10508d8a0c06 100644 --- a/python/TestHarness/tests/test_MachineType.py +++ b/python/TestHarness/tests/test_MachineType.py @@ -7,7 +7,7 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import os, sys, io +import os, io import unittest import mock import TestHarness @@ -21,7 +21,8 @@ def mocked_output(self, mocked, expect_fail, mocked_return): out = io.StringIO() with redirect_stdout(out): mocked_return.return_value=mocked - harness = TestHarness.TestHarness(['', '-i', 'always_ok', '-c'], MOOSE_DIR) + cmd = ['', '-i', 'always_ok', '-c', '--term-format', 'njCst'] + harness = TestHarness.TestHarness(cmd, MOOSE_DIR) if expect_fail: with self.assertRaises(SystemExit): harness.findAndRunTests() @@ -34,11 +35,11 @@ def testNotSkipped(self): Test should not be skipped, as it is set to run on any arch (ALL) """ out = self.mocked_output(set(['ALL']), False) - self.assertRegex(out, r'.*?OK.*?always_ok') + self.assertRegex(out, r'tests\/test_harness\.always_ok[\s.]+OK') def testSkipped(self): """ Test that a non existing machine type is skipped (remove default of ALL) """ out = self.mocked_output(set(['']), False) - self.assertRegex(out, r'.*?SKIP.*?always_ok.*?MACHINE!=ALL') + self.assertRegex(out, r'tests\/test_harness\.always_ok[\s.]+\[MACHINE!=ALL\]\s+SKIP') diff --git a/python/TestHarness/tests/test_MinADSize.py b/python/TestHarness/tests/test_MinADSize.py index a160eede58b7..8ef6b69e0074 100644 --- a/python/TestHarness/tests/test_MinADSize.py +++ b/python/TestHarness/tests/test_MinADSize.py @@ -15,6 +15,6 @@ def testMinADSize(self): """ Test AD vector size """ - output = self.runTests('-i', 'ad_size', '--no-color').decode('utf-8') + output = self.runTests('-i', 'ad_size', '--no-color') self.assertRegex(output, r'tests/test_harness.enough \.* OK') self.assertRegex(output, r'tests/test_harness\.too_few \.* \[MINIMUM AD SIZE 1000 NEEDED, BUT MOOSE IS CONFIGURED WITH \d+\] SKIP') diff --git a/python/TestHarness/tests/test_MissingGold.py b/python/TestHarness/tests/test_MissingGold.py index 7e23f5929074..6b2235d44d49 100644 --- a/python/TestHarness/tests/test_MissingGold.py +++ b/python/TestHarness/tests/test_MissingGold.py @@ -19,8 +19,8 @@ def testMissingGold(self): self.runTests('-i', 'missing_gold') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.exodiff.*?FAILED \(MISSING GOLD FILE\)') - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.csvdiff.*?FAILED \(MISSING GOLD FILE\)') + self.assertRegex(e.output, 'test_harness\.exodiff.*?FAILED \(MISSING GOLD FILE\)') + self.assertRegex(e.output, 'test_harness\.csvdiff.*?FAILED \(MISSING GOLD FILE\)') # Verify return code is a general failure related (0x80) self.assertIs(0x80, e.returncode) diff --git a/python/TestHarness/tests/test_OutputInterface.py b/python/TestHarness/tests/test_OutputInterface.py new file mode 100644 index 000000000000..8587d4c84466 --- /dev/null +++ b/python/TestHarness/tests/test_OutputInterface.py @@ -0,0 +1,125 @@ +#* This file is part of the MOOSE framework +#* https://www.mooseframework.org +#* +#* All rights 
reserved, see COPYRIGHT for full restrictions +#* https://github.com/idaholab/moose/blob/master/COPYRIGHT +#* +#* Licensed under LGPL 2.1, please see LICENSE for details +#* https://www.gnu.org/licenses/lgpl-2.1.html + +import unittest +import tempfile +import os + +from TestHarness import OutputInterface + +class TestHarnessTester(unittest.TestCase): + def testInMemory(self): + # Empty state + oi = OutputInterface() + self.assertIsNone(oi.getSeparateOutputFilePath()) + self.assertFalse(oi.hasOutput()) + self.assertEqual(oi.output, '') + self.assertEqual(oi.getOutput(), '') + + # Add output + output = 'foo' + oi.setOutput(output) + self.assertIsNone(oi.getSeparateOutputFilePath()) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Clear output + oi.clearOutput() + self.assertFalse(oi.hasOutput()) + + # Append output empty + output = 'bar' + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Append more + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output + output) + + # Reset + output = 'foo' + oi.setOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + # And then append + for i in range(2): + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output * 3) + + def testSeparate(self): + with tempfile.TemporaryDirectory() as dir: + output_file = os.path.join(dir, 'output') + + # Empty state + oi = OutputInterface() + oi.setSeparateOutputPath(output_file) + self.assertEqual(oi.getSeparateOutputFilePath(), output_file) + self.assertFalse(os.path.exists(output_file)) + self.assertFalse(oi.hasOutput()) + self.assertEqual(oi.output, '') + self.assertEqual(oi.getOutput(), '') + + # Add output + output = 'foo' + oi.setOutput(output) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Clear output + oi.clearOutput() + self.assertFalse(os.path.exists(output_file)) + self.assertFalse(oi.hasOutput()) + + # Append output empty + output = 'bar' + oi.appendOutput(output) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + + # Append more + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertTrue(os.path.exists(output_file)) + self.assertEqual(oi.getOutput(), output + output) + + # Reset + output = 'foo' + oi.setOutput(output) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output) + # And then append + for i in range(2): + oi.appendOutput(output) + self.assertTrue(oi.hasOutput()) + self.assertEqual(oi.getOutput(), output * 3) + + def testBadOutput(self): + oi = OutputInterface() + + null_chars = 'foobar\nwith a dirty \0and another dirty\x00' + null_replaced = null_chars.replace('\0', 'NULL').replace('\x00', 'NULL') + + # Set null characters + oi.setOutput(null_chars) + failures = oi.sanitizeOutput() + self.assertEqual(failures, ['NULL output']) + self.assertEqual(oi.getOutput(), null_replaced) + + # Set null characters without sanitize + oi.setOutput(null_chars) + with self.assertRaises(OutputInterface.BadOutputException) as e: + oi.getOutput() + self.assertEqual(e.exception.errors, ['NULL output']) + self.assertEqual(str(e.exception), 'Bad output detected: NULL output') diff --git a/python/TestHarness/tests/test_PBS.py 
b/python/TestHarness/tests/test_PBS.py deleted file mode 100644 index 9710fc63aa9a..000000000000 --- a/python/TestHarness/tests/test_PBS.py +++ /dev/null @@ -1,56 +0,0 @@ -#* This file is part of the MOOSE framework -#* https://www.mooseframework.org -#* -#* All rights reserved, see COPYRIGHT for full restrictions -#* https://github.com/idaholab/moose/blob/master/COPYRIGHT -#* -#* Licensed under LGPL 2.1, please see LICENSE for details -#* https://www.gnu.org/licenses/lgpl-2.1.html - -import subprocess, unittest, os -from TestHarnessTestCase import TestHarnessTestCase - -def checkQstat(): - try: - if subprocess.call(['qstat']) == 0: - return True - except: - pass - -@unittest.skipIf(checkQstat() != True, "PBS not available") -class TestHarnessTester(TestHarnessTestCase): - """ - Test general PBS functionality. There are some caveats however: - - We cannot test the output of specific test. Only the initial launch return code. This - is because launching qsub is a background process, and we have no idea when that job - is finished. Or if it even began (perhaps the job is queued). - """ - def setUp(self): - """ - setUp occurs before every test. Clean up previous results file - """ - pbs_results_file = os.path.join(os.getenv('MOOSE_DIR'), 'test', '_testPBS') - - # File will not exist on the first run - try: - os.remove(pbs_results_file) - except: - pass - - def testPBSQueue(self): - """ - Test argument '--pbs-queue does-not-exist' fails, as this queue should not exist - """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--pbs-queue', 'does-not-exist', '--pbs', '_testPBS', '-i', 'always_ok') - - e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'ERROR: qsub: Unknown queue') - - def testPBSLaunch(self): - """ - Test general launch command - """ - output = self.runTests('--pbs', '_testPBS', '-i', 'always_ok').decode('utf-8') - self.assertNotIn('LAUNCHED', output) diff --git a/python/TestHarness/tests/test_ParserErrors.py b/python/TestHarness/tests/test_ParserErrors.py index 6cd57851cc3d..907d6a7c25ff 100644 --- a/python/TestHarness/tests/test_ParserErrors.py +++ b/python/TestHarness/tests/test_ParserErrors.py @@ -17,5 +17,5 @@ def testSyntax(self): # check that parser errors print correctly # TODO: Are there more we can test? - output = self.runExceptionTests('-i', 'parse_errors').decode('utf-8') + output = self.runExceptionTests('-i', 'parse_errors') self.assertIn('duplicate parameter', output) diff --git a/python/TestHarness/tests/test_PythonVersion.py b/python/TestHarness/tests/test_PythonVersion.py index e3f9e4002944..e17871caf41c 100644 --- a/python/TestHarness/tests/test_PythonVersion.py +++ b/python/TestHarness/tests/test_PythonVersion.py @@ -12,7 +12,7 @@ class TestHarnessTester(TestHarnessTestCase): def testVersion(self): """Test that python=... 
is working.""" - output = self.runTests('-i', 'python_version').decode('utf-8') + output = self.runTests('-i', 'python_version') self.assertIn('[PYTHON != 2]', output) self.assertIn('[PYTHON != 3.5]', output) self.assertIn('[PYTHON != 3.4.1]', output) diff --git a/python/TestHarness/tests/test_RaceConditions.py b/python/TestHarness/tests/test_RaceConditions.py index f2a4922f132f..806b3192f448 100644 --- a/python/TestHarness/tests/test_RaceConditions.py +++ b/python/TestHarness/tests/test_RaceConditions.py @@ -20,4 +20,4 @@ def testRaceConditions(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('--pedantic-checks', '-i', 'output_clobber_simple') e = cm.exception - self.assertIn('Diagnostic analysis', e.output.decode('utf-8')) + self.assertIn('Diagnostic analysis', e.output) diff --git a/python/TestHarness/tests/test_Recover.py b/python/TestHarness/tests/test_Recover.py index d6a0a445752a..de8c8f7c725c 100644 --- a/python/TestHarness/tests/test_Recover.py +++ b/python/TestHarness/tests/test_Recover.py @@ -15,7 +15,7 @@ def testRecover(self): """ Test that --recover returns two passing statuses (part1 and the OK) """ - output = self.runTests('-i', 'always_ok', '--recover').decode('utf-8') + output = self.runTests('-i', 'always_ok', '--recover') self.assertIn('PART1', output) self.assertIn('RECOVER', output) @@ -30,8 +30,8 @@ def testRecoverPart1Fail(self): Test that --recover still checks status on Part1 tests """ with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('-i', 'exception_transient', '--recover').decode('utf-8') + self.runTests('-i', 'exception_transient', '--recover') e = cm.exception - output = e.output.decode('utf-8') + output = e.output self.assertRegex(output, r'test_harness.*?part1.*?FAILED \(CRASH\)') diff --git a/python/TestHarness/tests/test_Replay.py b/python/TestHarness/tests/test_Replay.py index 6e9c9504a24d..ab044e73b0a0 100644 --- a/python/TestHarness/tests/test_Replay.py +++ b/python/TestHarness/tests/test_Replay.py @@ -11,6 +11,7 @@ import re import subprocess import shutil +import tempfile from TestHarnessTestCase import TestHarnessTestCase @@ -40,30 +41,40 @@ def reCompile(self): def testReplay(self): """ Test ability to replay back previous run results """ - output_a = self.runTests('--verbose', '--timing', '-i', 'always_ok', '--results-file', 'unittest_Replay') - output_b = self.runTests('--verbose', '--timing', '--show-last-run', '--results-file', 'unittest_Replay') - compile = self.reCompile() - formated_a = compile.findall(str(output_a)) - formated_b = compile.findall(str(output_b)) + with tempfile.TemporaryDirectory() as output_dir: + base_args = ['--verbose', '-c', '--timing', '--results-file', 'unittest_Replay', '-o', output_dir] + base_kwargs = {'tmp_output': False} + output_a = self.runTests(*base_args, '-i', 'always_ok', **base_kwargs) + output_b = self.runTests(*base_args, '--show-last-run', **base_kwargs) - if formated_a != formated_b: - self.fail(f'--show-last-run did not match last run\n\n{formated_a}\n\n{formated_b}') + # The only difference should be the total run time, so replace the run time + # from the first with the run time from the second + def parseSummary(output): + search = re.search(r'Ran (\d+) tests in (\d+.\d+) seconds', output) + self.assertTrue(search is not None) + return int(search.group(1)), float(search.group(2)) + num_tests, total_time = parseSummary(output_a) + other_num_tests, other_total_time = parseSummary(output_b) + self.assertEqual(num_tests, other_num_tests) + 
output_b = output_b.replace(f'Ran {num_tests} tests in {other_total_time} seconds', + f'Ran {num_tests} tests in {total_time} seconds') + self.assertEqual(output_a, output_b) def testDiffReplay(self): """ Verify that the feature fails when asked to capture new output """ - output_a = self.runTests('--verbose', '--timing', '-i', 'always_ok', '--results-file', 'unittest_Replay') - # --re=doesenotexist will produce no output (or rather different output than the above) - output_b = self.runTests('--verbose', '--timing', '--show-last-run', '--results-file', 'unittest_Replay', '--re=doesnotexist') - compile = self.reCompile() - formated_a = compile.findall(str(output_a)) - formated_b = compile.findall(str(output_b)) - - if formated_a == formated_b: - self.fail(f'--show-last-run matched when it should not have') + with tempfile.TemporaryDirectory() as output_dir: + base_args = ['--verbose', '--timing', '--results-file', 'unittest_Replay', '-o', output_dir] + base_kwargs = {'tmp_output': False} + output_a = self.runTests(*base_args, '-i', 'always_ok', **base_kwargs) + # --re=doesenotexist will produce no output (or rather different output than the above) + output_b = self.runTests(*base_args, '--show-last-run', '--re=doesnotexist', **base_kwargs) + self.assertIn('Ran 1 tests in', output_a) + self.assertIn('Ran 0 tests in', output_b) def testNoResultsFile(self): """ Verify the TestHarness errors correctly when there is no results file to work with """ - with self.assertRaises(subprocess.CalledProcessError) as cm: - self.runTests('--show-last-run', '--results-file', 'non_existent') - e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'A previous run does not exist') + with tempfile.TemporaryDirectory() as output_dir: + with self.assertRaises(subprocess.CalledProcessError) as cm: + self.runTests('--show-last-run', '--results-file', 'non_existent', '-o', output_dir, tmp_output=False) + e = cm.exception + self.assertIn(f'The previous run {output_dir}/non_existent does not exist', e.output) diff --git a/python/TestHarness/tests/test_ReportSkipped.py b/python/TestHarness/tests/test_ReportSkipped.py index 83e2b5c085bb..c6f1e53043df 100644 --- a/python/TestHarness/tests/test_ReportSkipped.py +++ b/python/TestHarness/tests/test_ReportSkipped.py @@ -16,9 +16,9 @@ def testSyntax(self): """ # Verify the skipped test _does_ appear - output = self.runExceptionTests('--no-color', '-i', 'ignore_skipped').decode('utf-8') + output = self.runTests('--no-color', '-i', 'ignore_skipped') self.assertIn('[ALWAYS SKIPPED] SKIP', output) # Verify the skipped test does _not_ appear - output = self.runTests('--no-color', '--no-report', '-i', 'ignore_skipped').decode('utf-8') + output = self.runTests('--no-color', '--no-report', '-i', 'ignore_skipped') self.assertNotIn('[ALWAYS SKIPPED] SKIP', output) diff --git a/python/TestHarness/tests/test_RequiredApps.py b/python/TestHarness/tests/test_RequiredApps.py index 90b5546a38d5..377e2015908f 100644 --- a/python/TestHarness/tests/test_RequiredApps.py +++ b/python/TestHarness/tests/test_RequiredApps.py @@ -15,6 +15,6 @@ def testRequiredApps(self): Test that the required_apps check works """ output = self.runTests('--no-color', '-i', 'required_apps') - self.assertRegex(output.decode('utf-8'), r'test_harness\.bad_app.*? \[APP DOESNOTEXIST NOT REGISTERED IN EXECUTABLE\] SKIP') - self.assertRegex(output.decode('utf-8'), r'test_harness\.good_app.*? OK') - self.checkStatus(output.decode('utf-8'), passed=1, skipped=1) + self.assertRegex(output, r'test_harness\.bad_app.*? 
\[APP DOESNOTEXIST NOT REGISTERED IN EXECUTABLE\] SKIP') + self.assertRegex(output, r'test_harness\.good_app.*? OK') + self.checkStatus(output, passed=1, skipped=1) diff --git a/python/TestHarness/tests/test_RequiredObjects.py b/python/TestHarness/tests/test_RequiredObjects.py index 38987a18fa2b..66aa5cb09e04 100644 --- a/python/TestHarness/tests/test_RequiredObjects.py +++ b/python/TestHarness/tests/test_RequiredObjects.py @@ -15,6 +15,6 @@ def testRequiredObjects(self): Test that the required_objects check works """ output = self.runTests('--no-color', '-i', 'required_objects') - self.assertRegex(output.decode('utf-8'), r'test_harness\.bad_object.*? \[DOESNOTEXIST NOT FOUND IN EXECUTABLE\] SKIP') - self.assertRegex(output.decode('utf-8'), r'test_harness\.good_objects.*? OK') - self.checkStatus(output.decode('utf-8'), passed=1, skipped=1) + self.assertRegex(output, r'test_harness\.bad_object.*? \[DOESNOTEXIST NOT FOUND IN EXECUTABLE\] SKIP') + self.assertRegex(output, r'test_harness\.good_objects.*? OK') + self.checkStatus(output, passed=1, skipped=1) diff --git a/python/TestHarness/tests/test_SchemaDiff.py b/python/TestHarness/tests/test_SchemaDiff.py index 8a8c646ddf90..836ff2c3380b 100644 --- a/python/TestHarness/tests/test_SchemaDiff.py +++ b/python/TestHarness/tests/test_SchemaDiff.py @@ -13,7 +13,7 @@ class TestHarnessTester(TestHarnessTestCase): def testSchemaDiff(self): output = self.runExceptionTests('-i', 'schemadiff') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_jsondiff.*?FAILED \(SCHEMADIFF\)') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_xmldiff.*?FAILED \(SCHEMADIFF\)') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_invalid_json.*?FAILED \(LOAD FAILED\)') - self.assertRegex(output.decode('utf-8'), r'test_harness\.schema_invalid_xml.*?FAILED \(LOAD FAILED\)') + self.assertRegex(output, r'test_harness\.schema_jsondiff.*?FAILED \(SCHEMADIFF\)') + self.assertRegex(output, r'test_harness\.schema_xmldiff.*?FAILED \(SCHEMADIFF\)') + self.assertRegex(output, r'test_harness\.schema_invalid_json.*?FAILED \(LOAD FAILED\)') + self.assertRegex(output, r'test_harness\.schema_invalid_xml.*?FAILED \(LOAD FAILED\)') diff --git a/python/TestHarness/tests/test_ShouldExecute.py b/python/TestHarness/tests/test_ShouldExecute.py index a174b92ac3ad..9940bca5b07b 100644 --- a/python/TestHarness/tests/test_ShouldExecute.py +++ b/python/TestHarness/tests/test_ShouldExecute.py @@ -20,6 +20,6 @@ def testShouldExecute(self): self.runTests('-i', 'should_execute') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.should_execute_true_ok.*?OK') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.should_execute_false_ok.*?OK') - self.assertRegex(e.output.decode('utf-8'), r'test_harness\.should_execute_true_fail.*?FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'test_harness\.should_execute_true_ok.*?OK') + self.assertRegex(e.output, r'test_harness\.should_execute_false_ok.*?OK') + self.assertRegex(e.output, r'test_harness\.should_execute_true_fail.*?FAILED \(EXODIFF\)') diff --git a/python/TestHarness/tests/test_SoftHeavyDependency.py b/python/TestHarness/tests/test_SoftHeavyDependency.py index 718d46e63762..4c9f05475956 100644 --- a/python/TestHarness/tests/test_SoftHeavyDependency.py +++ b/python/TestHarness/tests/test_SoftHeavyDependency.py @@ -16,31 +16,31 @@ def testNotHeavy(self): """ output = self.runTests('--no-color', '-i', 'heavy_on_not_heavy') # The following should be skipped - 
self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_a .*? \[HEAVY\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_b .*? \[HEAVY\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy .*? \[HEAVY\] SKIP') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_a .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_b .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy .*? \[HEAVY\] SKIP') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? \[HEAVY\] SKIP') # The following should not be skipped, they should finish with an OK status. - self.assertRegex(output.decode('utf-8'), 'test_harness\.singleton_a .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.singleton_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_a .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? OK') + self.assertRegex(output, 'test_harness\.singleton_a .*? OK') + self.assertRegex(output, 'test_harness\.singleton_b .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy_a .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? OK') # The following should run, and should not list [implict heavy] caveat. # (a little redundant, but I don't see a way to check for this and the OK test above, in one go) - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.singleton_a .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.singleton_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_a .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.singleton_a .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.singleton_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy_a .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.not_heavy_on_singleton_a_and_singleton_b .*? \[IMPLICT HEAVY\] OK') # Special: caveat placements are random. Only check that it is skipped. 
# [skipped dependency,HEAVY] SKIP versus [HEAVY,skipped dependency] SKIP - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_heavy_a_and_heavy_b .*?SKIP') + self.assertRegex(output, 'test_harness\.heavy_on_heavy_a_and_heavy_b .*?SKIP') def testSoftHeavy(self): """ @@ -51,25 +51,25 @@ def testSoftHeavy(self): """ output = self.runTests('--no-color', '-i', 'heavy_on_not_heavy', '--heavy') # The following should run, and mention the additional [implicit heavy] caveat. - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy .*? \[IMPLICIT HEAVY\] OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_a .*? \[IMPLICIT HEAVY\] OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.not_heavy_b .*? \[IMPLICIT HEAVY\] OK') + self.assertRegex(output, 'test_harness\.not_heavy .*? \[IMPLICIT HEAVY\] OK') + self.assertRegex(output, 'test_harness\.not_heavy_a .*? \[IMPLICIT HEAVY\] OK') + self.assertRegex(output, 'test_harness\.not_heavy_b .*? \[IMPLICIT HEAVY\] OK') # The following should not be skipped, they should finish with an OK status. - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_a .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? OK') - self.assertRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.heavy_a .*? OK') + self.assertRegex(output, 'test_harness\.heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy .*? OK') + self.assertRegex(output, 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? OK') + self.assertRegex(output, 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? OK') # The following should not be skipped, and should not list [implicit heavy] caveat. # (a little redundant, but I don't see a way to check for this and the OK test above, in one go) - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_a .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? \[IMPLICT HEAVY\] OK') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_a .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_on_not_heavy .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_on_heavy_a_and_heavy_b .*? \[IMPLICT HEAVY\] OK') + self.assertNotRegex(output, 'test_harness\.heavy_on_not_heavy_a_and_not_heavy_b .*? 
\[IMPLICT HEAVY\] OK') # The following should not run at all (the test is silent, and not displayed in the output) - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.singleton.*?') - self.assertNotRegex(output.decode('utf-8'), 'test_harness\.not_heavy_on_singleton_a_and_singleton_b.*?') + self.assertNotRegex(output, 'test_harness\.singleton.*?') + self.assertNotRegex(output, 'test_harness\.not_heavy_on_singleton_a_and_singleton_b.*?') diff --git a/python/TestHarness/tests/test_Syntax.py b/python/TestHarness/tests/test_Syntax.py index 84321c3fdb2d..7de9146bf8d8 100644 --- a/python/TestHarness/tests/test_Syntax.py +++ b/python/TestHarness/tests/test_Syntax.py @@ -16,22 +16,22 @@ def testSyntax(self): """ # Test that the SYNTAX PASS status message properly displays - output = self.runTests('-i', 'syntax').decode('utf-8') + output = self.runTests('-i', 'syntax') self.assertIn('SYNTAX PASS', output) # Test that the SYNTAX PASS status message properly displays - output = self.runTests('--check-input', '-i', 'syntax').decode('utf-8') + output = self.runTests('--check-input', '-i', 'syntax') self.assertIn('SYNTAX PASS', output) # Check that the _non_ SYNTAX test was not run - output = self.runTests('--check-input', '-i', 'no_syntax').decode('utf-8') + output = self.runTests('--check-input', '-i', 'no_syntax') self.assertNotIn('SYNTAX PASS', output) # Check that _thee_ SYNTAX test is not run - output = self.runTests('--no-check-input', '-i', 'syntax').decode('utf-8') + output = self.runTests('--no-check-input', '-i', 'syntax') self.assertNotIn('SYNTAX PASS', output) # Check that it is skipped when running valgrind - output = self.runTests('--valgrind', '-i', 'syntax').decode('utf-8') + output = self.runTests('--valgrind', '-i', 'syntax') self.assertIn('CHECK_INPUT==TRUE', output) self.checkStatus(output, skipped=1) diff --git a/python/TestHarness/tests/test_Timeout.py b/python/TestHarness/tests/test_Timeout.py index 4237807231ef..e36b56b106e9 100644 --- a/python/TestHarness/tests/test_Timeout.py +++ b/python/TestHarness/tests/test_Timeout.py @@ -19,7 +19,7 @@ def testTimeout(self): self.runTests('-i', 'timeout') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.timeout.*?TIMEOUT') + self.assertRegex(e.output, 'test_harness\.timeout.*?TIMEOUT') # Verify return code is TIMEOUT related (0x1) self.assertIs(0x1, e.returncode) @@ -34,7 +34,7 @@ def testTimeoutEnv(self): os.environ.pop('MOOSE_TEST_MAX_TIME') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), 'test_harness\.timeout.*?TIMEOUT') + self.assertRegex(e.output, 'test_harness\.timeout.*?TIMEOUT') # Verify return code is TIMEOUT related (0x1) self.assertIs(0x1, e.returncode) diff --git a/python/TestHarness/tests/test_TrimOutput.py b/python/TestHarness/tests/test_TrimOutput.py index 17c739b14811..3979793859f9 100644 --- a/python/TestHarness/tests/test_TrimOutput.py +++ b/python/TestHarness/tests/test_TrimOutput.py @@ -16,14 +16,14 @@ def testTrimOutput(self): Verify output exceeded buffer, and is therfore trimmed """ output = self.runTests('--no-color', '-i', 'trimmed_output', '-v') - self.assertIn('Output trimmed', output.decode('utf-8')) + self.assertIn('Output trimmed', output) def testNoTrimOutput(self): """ Verify trimming did not take place """ output = self.runTests('--no-color', '-i', 'always_ok', '-v') - self.assertNotIn('Output trimmed', output.decode('utf-8')) + self.assertNotIn('Output trimmed', output) def testNoTrimmedOutputOnError(self): """ @@ -34,4 +34,4 @@ def 
testNoTrimmedOutputOnError(self): self.runTests('--no-color', '-i', 'no_trim_on_error', '--no-trimmed-output-on-error', '-v') e = cm.exception - self.assertNotIn('Output trimmed', e.output.decode('utf-8')) + self.assertNotIn('Output trimmed', e.output) diff --git a/python/TestHarness/tests/test_UnknownParam.py b/python/TestHarness/tests/test_UnknownParam.py index e3a4d662a962..8c77f0de4d9e 100644 --- a/python/TestHarness/tests/test_UnknownParam.py +++ b/python/TestHarness/tests/test_UnknownParam.py @@ -15,4 +15,4 @@ def testUnknownParam(self): self.runTests('--no-color', '-i', 'unknown_param') self.assertIn('unknown_param:5: unused parameter "not_a_parameter"', - cm.exception.output.decode('utf-8')) + cm.exception.output) diff --git a/python/TestHarness/tests/test_UnknownPrereq.py b/python/TestHarness/tests/test_UnknownPrereq.py index f10b7058247b..2887211f136c 100644 --- a/python/TestHarness/tests/test_UnknownPrereq.py +++ b/python/TestHarness/tests/test_UnknownPrereq.py @@ -19,4 +19,4 @@ def testUnknownPrereq(self): self.runTests('-i', 'unknown_prereq') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.foo.*?FAILED \(unknown dependency\)') + self.assertRegex(e.output, r'tests/test_harness.foo.*?FAILED \(unknown dependency non_existent\)') diff --git a/python/TestHarness/tests/test_UnreadableOutput.py b/python/TestHarness/tests/test_UnreadableOutput.py index 21ef17dbb9f2..d1712a58d8e4 100644 --- a/python/TestHarness/tests/test_UnreadableOutput.py +++ b/python/TestHarness/tests/test_UnreadableOutput.py @@ -19,4 +19,4 @@ def testUnreadableOutput(self): with self.assertRaises(subprocess.CalledProcessError) as cm: self.runTests('-i', 'non_unicode') e = cm.exception - self.assertIn('non-unicode characters in output', e.output.decode('utf-8')) + self.assertIn('non-unicode characters in output', e.output) diff --git a/python/TestHarness/tests/test_WorkingDirectory.py b/python/TestHarness/tests/test_WorkingDirectory.py index a959ad87398f..754fd6a49118 100644 --- a/python/TestHarness/tests/test_WorkingDirectory.py +++ b/python/TestHarness/tests/test_WorkingDirectory.py @@ -17,7 +17,7 @@ def testWorkingDirectoryGood(self): """ # Test a working scenario output = self.runTests('--no-color', '-i', 'working_directory', '--re', 'relative_and_available') - self.assertRegex(output.decode('utf-8'), 'tests/test_harness.relative_and_available.*? OK') + self.assertRegex(output, 'tests/test_harness.relative_and_available.*? OK') def testDependency(self): @@ -26,7 +26,7 @@ def testDependency(self): self.runTests('--no-color', '-i', 'working_directory') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.depend_on_available.*? OK') + self.assertRegex(e.output, r'tests/test_harness.depend_on_available.*? OK') def testAbsolutePath(self): # Test we catch an absolute path @@ -34,7 +34,7 @@ def testAbsolutePath(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'absolute_path') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.absolute_path.*? FAILED \(ABSOLUTE PATH DETECTED\)') + self.assertRegex(e.output, r'tests/test_harness.absolute_path.*? FAILED \(ABSOLUTE PATH DETECTED\)') def testDirectoryNotFound(self): # Test we catch a directory not found @@ -42,7 +42,7 @@ def testDirectoryNotFound(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'non_existent') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.non_existent.*? 
FAILED \(WORKING DIRECTORY NOT FOUND\)') + self.assertRegex(e.output, r'tests/test_harness.non_existent.*? FAILED \(WORKING DIRECTORY NOT FOUND\)') def testExodiff(self): ## Specific Testers ## @@ -51,7 +51,7 @@ def testExodiff(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'exodiff') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.exodiff.*? FAILED \(EXODIFF\)') + self.assertRegex(e.output, r'tests/test_harness.exodiff.*? FAILED \(EXODIFF\)') def testCSVDiff(self): # csvdiff can access sub directories @@ -59,7 +59,7 @@ def testCSVDiff(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'csvdiff') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.csvdiff.*? FAILED \(Override inputs not the same length\)') + self.assertRegex(e.output, r'tests/test_harness.csvdiff.*? FAILED \(Override inputs not the same length\)') def testRunException(self): # RunException can access sub directories @@ -67,4 +67,4 @@ def testRunException(self): self.runTests('--no-color', '-i', 'working_directory', '--re', 'runexception') e = cm.exception - self.assertRegex(e.output.decode('utf-8'), r'tests/test_harness.runexception.*? FAILED \(EXPECTED ERROR MISSING\)') + self.assertRegex(e.output, r'tests/test_harness.runexception.*? FAILED \(EXPECTED ERROR MISSING\)') diff --git a/python/TestHarness/tests/test_WriteResults.py b/python/TestHarness/tests/test_WriteResults.py index 1845d3e4cd5e..c1a9493edf1c 100644 --- a/python/TestHarness/tests/test_WriteResults.py +++ b/python/TestHarness/tests/test_WriteResults.py @@ -7,62 +7,60 @@ #* Licensed under LGPL 2.1, please see LICENSE for details #* https://www.gnu.org/licenses/lgpl-2.1.html -import shutil, os, subprocess +import json, os, subprocess, tempfile from TestHarnessTestCase import TestHarnessTestCase class TestHarnessTester(TestHarnessTestCase): - def setUp(self): - """ - setUp occurs before every test. Clean up previous results file - """ - self.output_dir = os.path.join(os.getenv('MOOSE_DIR'), 'test', 'WriteResults_OUTPUT') - - try: - # remove previous results file - shutil.rmtree(self.output_dir) - except: - pass - def tearDown(self): """ tearDown occurs after every test. 
""" self.setUp() - def testWriteOK(self): - """ Test ability to write separate OK test --sep-files-ok """ - self.runTests('--no-color', '-i', 'always_ok', '--sep-files-ok', '--output-dir', self.output_dir) - if not os.path.exists(os.path.join(self.output_dir, 'test_harness.always_ok.OK.txt')): - self.fail('Failed to create sep-files-ok') + def checkFilesExist(self, output_dir, tests, output_object_names): + # The directories within the test directory where these tests reside + test_folders = ['tests', 'test_harness'] + # The complete path to the directory where the tests reside + test_base_path = os.path.join(os.getenv('MOOSE_DIR'), 'test', *test_folders) + # The complete path where the output should reside + output_base_path = os.path.join(output_dir, *test_folders) - with self.assertRaises(subprocess.CalledProcessError): - self.runTests('--no-color', '-i', 'diffs', '--sep-files-ok', '--output-dir', self.output_dir) + # Load the previous results + with open(os.path.join(output_dir, '.previous_test_results.json')) as f: + results = json.load(f) + test_results = results['tests'] + # We should only have one test spec + self.assertEqual(1, len(test_results)) + # The test spec should be in the results + self.assertIn(test_base_path, test_results) + test_spec_results = test_results[test_base_path] + # The number of tests in the test spec should be the number provided + self.assertEqual(len(tests), len(test_spec_results)) - if (os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.DIFF.txt')) - or os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.OK.txt'))): - self.fail('Test results which failed were created when asked NOT to do so: --sep-files-ok') - - def testWriteFail(self): - """ Test ability to write separate Fail test --sep-files-fail """ - with self.assertRaises(subprocess.CalledProcessError): - self.runTests('--no-color', '-i', 'diffs', '--sep-files-fail', '--output-dir', self.output_dir) - - if not (os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.DIFF.txt')) - and os.path.exists(os.path.join(self.output_dir, 'test_harness.csvdiff.DIFF.txt'))): - self.fail('Failed to create sep-files-fail') - - self.runTests('--no-color', '-i', 'always_ok', '--sep-files-fail', '--output-dir', self.output_dir) - if os.path.exists(os.path.join(self.output_dir, 'test_harness.always_ok.OK.txt')): - self.fail('Test results which passed were created when asked NOT to do so: --sep-files-fail') + for test in tests: + # The test name should be in the test spec results + test_name_short = f'{"/".join(test_folders)}.{test}' + self.assertIn(test_name_short, test_spec_results) + test_results = test_spec_results[test_name_short] + # Get the output files from the test spec + result_output_files = test_results['output_files'] + # Make sure each output file exists and is set in the results file + for name in output_object_names: + output_path = f'{output_base_path}/{test}.{name}_out.txt' + self.assertTrue(os.path.exists(output_path)) + self.assertEqual(result_output_files[name], output_path) + # And make sure that we don't have output from any other objects + for name, output_path in result_output_files.items(): + if name not in output_object_names: + self.assertEqual(output_path, None) def testWriteAll(self): """ Test write all output files --sep-files """ - with self.assertRaises(subprocess.CalledProcessError): - self.runTests('--no-color', '-i', 'diffs', '--sep-files', '--output-dir', self.output_dir) - - self.runTests('--no-color', '-i', 'always_ok', '--sep-files', 
'--output-dir', self.output_dir) + with tempfile.TemporaryDirectory() as output_dir: + with self.assertRaises(subprocess.CalledProcessError): + self.runTests('--no-color', '-i', 'diffs', '--sep-files', '-o', output_dir, tmp_output=False) + self.checkFilesExist(output_dir, ['csvdiff', 'exodiff'], ['runner_run', 'tester']) - if not (os.path.exists(os.path.join(self.output_dir, 'test_harness.always_ok.OK.txt')) - and os.path.exists(os.path.join(self.output_dir, 'test_harness.csvdiff.DIFF.txt')) - and os.path.exists(os.path.join(self.output_dir, 'test_harness.exodiff.DIFF.txt'))): - self.fail('Failed to create all output files --sep-files') + with tempfile.TemporaryDirectory() as output_dir: + self.runTests('--no-color', '-i', 'always_ok', '--sep-files', '-o', output_dir, tmp_output=False) + self.checkFilesExist(output_dir, ['always_ok'], ['runner_run']) diff --git a/python/TestHarness/tests/tests b/python/TestHarness/tests/tests index 68689f62b1b0..794b54b3698d 100644 --- a/python/TestHarness/tests/tests +++ b/python/TestHarness/tests/tests @@ -183,12 +183,6 @@ requirement = "The system shall be able to perform recovery of a test" issues = '#11492' [] - [pbs_tests] - type = PythonUnitTest - input = test_PBS.py - requirement = "The system shall be able to submit jobs to a PBS third party scheduler" - issues = '#12138' - [] [trim_output] type = PythonUnitTest input = test_TrimOutput.py @@ -287,4 +281,10 @@ requirement = "The system shall not skip non-heavy tests for which heavy tests depend on" issues = '#26215' [] + [test_output_interface] + type = PythonUnitTest + input = test_OutputInterface.py + requirement = 'The system shall provide a common interface for storing and retrieving output that supports sanitization' + issues = '#27562' + [] [] diff --git a/python/TestHarness/util.py b/python/TestHarness/util.py index f89ae79423af..a00ad0e6a8d4 100644 --- a/python/TestHarness/util.py +++ b/python/TestHarness/util.py @@ -15,9 +15,6 @@ import yaml import sys -TERM_COLS = int(os.getenv('MOOSE_TERM_COLS', '110')) -TERM_FORMAT = os.getenv('MOOSE_TERM_FORMAT', 'njcst') - MOOSE_OPTIONS = { 'ad_size' : { 're_option' : r'#define\s+MOOSE_AD_MAX_DOFS_PER_ELEM\s+(\d+)', 'default' : '64' @@ -262,8 +259,8 @@ def formatStatusMessage(job, status, message, options): # 2) the color parameter is False.
def formatResult(job, options, result='', color=True, **kwargs): # Support only one instance of a format identifier, but obey the order - terminal_format = list(OrderedDict.fromkeys(list(TERM_FORMAT))) - status, message, message_color, exit_code, sort_value = job.getJointStatus() + terminal_format = list(OrderedDict.fromkeys(list(options.term_format))) + joint_status = job.getJointStatus() color_opts = {'code' : options.code, 'colored' : options.colored} @@ -285,18 +282,18 @@ def formatResult(job, options, result='', color=True, **kwargs): justification_index = terminal_format[i] if str(f_key).lower() == 'p': - pre_result = ' '*(8-len(status)) + status - formatCase(f_key, (pre_result, message_color), formatted_results) + pre_result = ' '*(8-len(joint_status.status)) + joint_status.status + formatCase(f_key, (pre_result, joint_status.color), formatted_results) if str(f_key).lower() == 's': if not result: - result = formatStatusMessage(job, status, message, options) + result = formatStatusMessage(job, joint_status.status, joint_status.message, options) # refrain from printing a duplicate pre_result if it will match result - if 'p' in [x.lower() for x in terminal_format] and result == status: + if 'p' in [x.lower() for x in terminal_format] and result == joint_status.status: formatCase(f_key, None, formatted_results) else: - formatCase(f_key, (result, message_color), formatted_results) + formatCase(f_key, (result, joint_status.color), formatted_results) if str(f_key).lower() == 'n': formatCase(f_key, (job.getTestName(), None), formatted_results) @@ -313,7 +310,7 @@ def formatResult(job, options, result='', color=True, **kwargs): # Decorate Caveats if job.getCaveats() and caveat_index is not None and 'caveats' in kwargs and kwargs['caveats']: caveats = ','.join(job.getCaveats()) - caveat_color = message_color + caveat_color = joint_status.color if not job.isFail(): caveat_color = 'CYAN' @@ -322,12 +319,12 @@ def formatResult(job, options, result='', color=True, **kwargs): character_count = resultCharacterCount(formatted_results) + len(f_caveats) + 1 # If caveats are the last items the user wants printed, or -e (extra_info) is - # called, allow caveats to consume available character count beyond TERM_COLS. + # called, allow caveats to consume available character count beyond options.term_cols. # Else, we trim caveats: if terminal_format[-1].lower() != 'c' \ and not options.extra_info \ - and character_count > TERM_COLS: - over_by_amount = character_count - TERM_COLS + and character_count > options.term_cols: + over_by_amount = character_count - options.term_cols f_caveats = '[' + caveats[:len(caveats) - (over_by_amount + 3)] + '...]' formatCase(caveat_index, (f_caveats, caveat_color), formatted_results) @@ -337,9 +334,9 @@ def formatResult(job, options, result='', color=True, **kwargs): j_dot = None # +1 space created later by join character_count = resultCharacterCount(formatted_results) + 1 - if character_count < TERM_COLS: - j_dot = ('.'*max(0, (TERM_COLS - character_count)), 'GREY') - elif character_count == TERM_COLS: + if character_count < options.term_cols: + j_dot = ('.'*max(0, (options.term_cols - character_count)), 'GREY') + elif character_count == options.term_cols: j_dot = ('', 'GREY') formatCase(justification_index, j_dot, formatted_results) @@ -872,60 +869,27 @@ def deleteFilesAndFolders(test_dir, paths, delete_folders=True): # TL;DR; Just pass... 
pass -# Check if test has any redirected output, and if its ready to be read -def checkOutputReady(tester, options): - checked_files = [] - for redirected_file in tester.getRedirectedOutputFiles(options): - file_path = os.path.join(tester.getTestDir(), redirected_file) - if os.access(file_path, os.R_OK): - checked_files.append(file_path) - return checked_files - -# return concatenated output from tests with redirected output -def getOutputFromFiles(tester, options): - file_output = '' - output_files = checkOutputReady(tester, options) - for file_path in output_files: - with open(file_path, 'r+b') as f: - file_output += "#"*80 + "\nOutput from " + file_path \ - + "\n" + "#"*80 + "\n" + readOutput(f, None, tester) - return file_output - -# Read stdout and stderr file objects, append error and return the string -def readOutput(stdout, stderr, tester): - output = '' - try: - if stdout: - stdout.seek(0) - output += stdout.read().decode('utf-8') - if stderr: - stderr.seek(0) - output += stderr.read().decode('utf-8') - except UnicodeDecodeError: - tester.setStatus(tester.fail, 'non-unicode characters in output') - except: - tester.setStatus(tester.fail, 'error while attempting to read output files') - - return output - -# Trimming routines for job output -def trimOutput(job, options): - output = job.getOutput() - if ((job.isFail() and options.no_trimmed_output_on_error) - or (job.specs.isValid('max_buffer_size') and job.specs['max_buffer_size'] == -1) - or options.no_trimmed_output): - return output - elif job.specs.isValid('max_buffer_size'): - max_size = int(job.specs['max_buffer_size']) - else: - max_size = 100000 - - if len(output) <= max_size: +def trimOutput(output, max_size=None): + """ Trims the output given some max size """ + if not max_size or len(output) < max_size or not output: return output first_part = int(max_size*(2.0/3.0)) second_part = int(max_size*(1.0/3.0)) - return "%s\n%s\n\nOutput trimmed\n\n%s\n%s" % (output[:first_part], - "#"*80, - "#"*80, - output[-second_part:]) + trimmed = f'{output[:first_part]}' + if trimmed[-1] != '\n': + trimmed += '\n' + sep = "#" * 80 + trimmed += f'\n{sep}\nOutput trimmed\n{sep}\n{output[-second_part:]}' + return trimmed + +def outputHeader(header, ending=True): + """ + Returns text for output with a visual separator, i.e.: + ##############################... +
+ ##############################... + """ + begin_sep = '#' * 80 + end_sep = f'{begin_sep}\n' if ending else '' + return f'{begin_sep}\n{header}\n{end_sep}' diff --git a/python/contrib/dag/__init__.py b/python/contrib/dag/__init__.py index ca7a75d1386b..4678a93a14b4 100644 --- a/python/contrib/dag/__init__.py +++ b/python/contrib/dag/__init__.py @@ -316,3 +316,17 @@ def delete_edge_if_exists(self, ind_node, dep_node, graph=None): if dep_node not in graph.get(ind_node, []): return graph[ind_node].remove(dep_node) + + # Added by the MOOSE group + def is_dependency(self, ind_node, dep_node, graph=None): + """ Returns whether or not dep_node depends on ind_node """ + if not graph: + graph = self.graph + + deps = graph[ind_node] + if dep_node in deps: + return True + for node in deps: + if self.is_dependency(node, dep_node, graph=graph): + return True + return False diff --git a/python/doc/content/python/TestHarness.md b/python/doc/content/python/TestHarness.md index 5fb50931d86d..6609ad163640 100644 --- a/python/doc/content/python/TestHarness.md +++ b/python/doc/content/python/TestHarness.md @@ -89,7 +89,6 @@ Tester system is completely pluggable and extendable. The list of default tester - [AnalyzeJacobian](AnalyzeJacobian.md) - [PetscJacobianTester](PetscJacobianTester.md) - [PythonUnitTest](PythonUnitTest.md) (includes [Method of Manufactured solutions](python/mms.md) testing) -- [BenchmarkTesting](application_development/performance_benchmarking.md) ## Test Specifications diff --git a/python/mooseutils/mooseutils.py b/python/mooseutils/mooseutils.py index 42539e23646c..616fe4129c29 100644 --- a/python/mooseutils/mooseutils.py +++ b/python/mooseutils/mooseutils.py @@ -165,7 +165,7 @@ def run_executable(app_path, *args, mpi=None, suppress_output=False): A function for running an application. """ import subprocess - if mpi and isinstance(mpi, int): + if mpi and isinstance(mpi, int) and mpi > 1: cmd = ['mpiexec', '-n', str(mpi), app_path] else: cmd = [app_path] diff --git a/test/tests/kernels/simple_diffusion/speedtests b/test/tests/kernels/simple_diffusion/speedtests deleted file mode 100644 index b3bb04633781..000000000000 --- a/test/tests/kernels/simple_diffusion/speedtests +++ /dev/null @@ -1,17 +0,0 @@ -[Benchmarks] - [diffusion_100x100] - type = SpeedTest - input = simple_diffusion.i - cli_args = 'Mesh/nx=100 Mesh/ny=100' - [] - [diffusion_200x200] - type = SpeedTest - input = simple_diffusion.i - cli_args = 'Mesh/nx=200 Mesh/ny=200' - [] - [uniform_refine_4] - type = SpeedTest - input = simple_diffusion.i - cli_args = 'Mesh/uniform_refine=4' - [] -[] diff --git a/test/tests/kernels/simple_diffusion/tests b/test/tests/kernels/simple_diffusion/tests index 4a82308a779e..0b9c12b0e446 100644 --- a/test/tests/kernels/simple_diffusion/tests +++ b/test/tests/kernels/simple_diffusion/tests @@ -7,5 +7,8 @@ issues = '#1493' design = 'kernels/Diffusion.md' requirement = 'The system shall run a simple 2D linear diffusion problem with Dirichlet boundary conditions on a regular mesh.' 
+ + # Enables running the limited HPC tests on CIVET on all events + group = 'hpc' [] [] diff --git a/test/tests/kernels/simple_transient_diffusion/speedtests b/test/tests/kernels/simple_transient_diffusion/speedtests deleted file mode 100644 index 004804b52054..000000000000 --- a/test/tests/kernels/simple_transient_diffusion/speedtests +++ /dev/null @@ -1,13 +0,0 @@ -[Benchmarks] - [./trans_diffusion_100x100_t5] - type = SpeedTest - input = simple_transient_diffusion.i - cli_args = 'Mesh/nx=100 Mesh/ny=100 Executioner/num_steps=5' - [../] - [./trans_diffusion_100x100_t10] - type = SpeedTest - input = simple_transient_diffusion.i - cli_args = 'Mesh/nx=100 Mesh/ny=100 Executioner/num_steps=10' - prereq = 'trans_diffusion_100x100_t5' - [../] -[] diff --git a/test/tests/meshgenerators/distributed_rectilinear/generator/tests b/test/tests/meshgenerators/distributed_rectilinear/generator/tests index bc80d0e11f78..2b6094159e67 100644 --- a/test/tests/meshgenerators/distributed_rectilinear/generator/tests +++ b/test/tests/meshgenerators/distributed_rectilinear/generator/tests @@ -33,6 +33,8 @@ cli_args = 'Mesh/gmg/dim=3 Mesh/gmg/nx=20 Mesh/gmg/ny=20 Mesh/gmg/nz=20 Outputs/file_base=distributed_rectilinear_mesh_generator_out_3d Outputs/hide="pid npid" ' requirement = 'The system shall be able to generate 3D HEX8 mesh in parallel.' valgrind = 'NONE' + # Enables running the limited HPC tests on CIVET on all events + group = 'hpc' [../] [./3D_ptscotch] diff --git a/test/tests/misc/intermittent_failure/tests b/test/tests/misc/intermittent_failure/tests index d33b0bd8c830..9221f6868ce7 100644 --- a/test/tests/misc/intermittent_failure/tests +++ b/test/tests/misc/intermittent_failure/tests @@ -10,5 +10,8 @@ valgrind = 'NONE' method = '!dbg' + + # HPC runs could lead to a timeout + hpc = false [../] [] diff --git a/test/tests/misc/mpi_setup/tests b/test/tests/misc/mpi_setup/tests deleted file mode 100644 index c5ce954d2521..000000000000 --- a/test/tests/misc/mpi_setup/tests +++ /dev/null @@ -1,9 +0,0 @@ -[Tests] - [basic_mpirun_works] - type = RunCommand - command = 'mpirun -n 2 hostname' - requirement = "The system's test suite shall verify that the parallel environment is at least minimally working." 
- issues = '#22635' - design = 'MooseUtils.md' - [] -[] diff --git a/test/tests/misc/solution_invalid/tests b/test/tests/misc/solution_invalid/tests index 0a8af98cf4ab..b20f3627604d 100644 --- a/test/tests/misc/solution_invalid/tests +++ b/test/tests/misc/solution_invalid/tests @@ -60,6 +60,7 @@ type = JSONDiff input = solution_invalid.i jsondiff = 'solution_invalid.json' + hpc = false # iteration counts can be MPI dependent design = 'SolutionInvalidity.md' issues = '#22814' requirement = 'The system shall be able to output detailed reasons and occurrences as to why a solution is invalid to file' @@ -72,6 +73,7 @@ input = solution_invalid.i cli_args = "Executioner/type=Transient Executioner/num_steps=1 Executioner/dtmin=1e-13 Executioner/error_on_dtmin=false Outputs/file_base='solution_invalid_transient' " jsondiff = 'solution_invalid_transient.json' + hpc = false # iteration counts can be MPI dependent design = 'SolutionInvalidity.md' issues = '#22814' requirement = 'The system shall be able to output detailed information about why a solution is invalid to a file in transient simulations' @@ -84,6 +86,7 @@ input = solution_invalid.i cli_args = "Materials/filter/test_different_procs=true Outputs/file_base='solution_invalid_parallel'" jsondiff = 'solution_invalid_parallel.json' + hpc = false # iteration counts can be MPI dependent design = 'SolutionInvalidity.md' min_parallel = 3 max_parallel = 3 @@ -113,6 +116,7 @@ type = JSONDiff input = solution_invalid_recover.i jsondiff = 'solution_invalid_checkpoint.json' + hpc = false # iteration counts can be MPI dependent cli_args = "Outputs/file_base='solution_invalid_checkpoint' Outputs/checkpoint=true" detail = 'outputting of checkpoint files' [] diff --git a/test/tests/outputs/iterative/tests b/test/tests/outputs/iterative/tests index b25cd2a457f8..a09ab47ac88e 100644 --- a/test/tests/outputs/iterative/tests +++ b/test/tests/outputs/iterative/tests @@ -78,6 +78,8 @@ recover = false petsc_version_release = true compiler = '!INTEL' + # MPI dependent + hpc = false requirement = "The system shall support output during linear and non-linear iterations during transient simulations using the CSV format." [../] diff --git a/test/tests/postprocessors/find_value_on_line/tests b/test/tests/postprocessors/find_value_on_line/tests index 84cd58ccc07b..d5f2b216e27d 100644 --- a/test/tests/postprocessors/find_value_on_line/tests +++ b/test/tests/postprocessors/find_value_on_line/tests @@ -67,7 +67,7 @@ [./line_out_of_bounds] type = 'RunException' input = 'findvalueonline.i' - cli_args = "\"Postprocessors/pos/end_point='11 0 0' Outputs/csv=false\"" + cli_args = "Postprocessors/pos/end_point='11 0 0' Outputs/csv=false" expect_err = "No element found at the current search point" requirement = 'The system shall report and error when the sampling line extends beyond the mesh bounding box.' diff --git a/test/tests/postprocessors/num_residual_eval/tests b/test/tests/postprocessors/num_residual_eval/tests index 99e6fb9b7172..a897304d744a 100644 --- a/test/tests/postprocessors/num_residual_eval/tests +++ b/test/tests/postprocessors/num_residual_eval/tests @@ -13,6 +13,8 @@ # This test requires a different number of residual evaluations in older PETScs # due to the changes in 3061bbd5d. 
petsc_version = '>=3.8.3' + # Difficult to have consistent solve behavior when running across nodes + hpc = false requirement = 'The system shall be capable of outputting the number of Residual evaluations' issues = '#2089' diff --git a/test/tests/restart/advanced_stateful_material/tests b/test/tests/restart/advanced_stateful_material/tests index 744184d193c5..ca4a8433d6ca 100644 --- a/test/tests/restart/advanced_stateful_material/tests +++ b/test/tests/restart/advanced_stateful_material/tests @@ -6,6 +6,10 @@ type = RunApp input = advanced_stateful_material.i requirement = 'The system shall be able to generate a checkpoint of stateful material properties that supports the advanced restart of the properties' + # RunException with some cases will only run in serial, which means if + # "checkpoint" runs in parallel we'll actually get an error with a + # processor mismatch instead of the one that we want + hpc = false [] [errors] diff --git a/test/tests/samplers/distribute/tests b/test/tests/samplers/distribute/tests index 591d226c918e..8fc74a0b88af 100644 --- a/test/tests/samplers/distribute/tests +++ b/test/tests/samplers/distribute/tests @@ -14,6 +14,7 @@ python = 3 required_python_packages = 'pandas matplotlib' installation_type = in_tree # see #26480 + hpc = false # see #26480 [] [plot] type = CheckFiles @@ -26,6 +27,7 @@ required_python_packages = 'pandas matplotlib' detail = "demonstrates efficient parallel scaling of memory use." installation_type = in_tree # see #26480 + hpc = false # see #26480 [] [] [] diff --git a/test/tests/test_harness/duplicate_outputs_prereqs b/test/tests/test_harness/duplicate_outputs_prereqs deleted file mode 100644 index 962258e7a0c2..000000000000 --- a/test/tests/test_harness/duplicate_outputs_prereqs +++ /dev/null @@ -1,27 +0,0 @@ -[Tests] - # Needed because the default of false will cause the - # race condition checks to be skipped - parallel_scheduling = true - - [./a] - type = Exodiff - input = good.i - cli_args = "Outputs/file_base=good_exodiff_out Outputs/exodus=true" - exodiff = 'good_exodiff_out.e' - [../] - [./b] - type = Exodiff - input = good.i - exodiff = 'good_out.e' - prereq = 'a' - [../] - - # While there are no immediate race conditions when this spec file is launched, - # it is possible 'a' will finish quickly, allowing 'b' to run while 'c' might - # _still_ be running... and that is a race condition. - [./c] - type = Exodiff - input = good.i - exodiff = 'good_out.e' - [../] -[] diff --git a/test/tests/test_harness/multiple_duplicate_outputs b/test/tests/test_harness/multiple_duplicate_outputs deleted file mode 100644 index 0edc754ef2be..000000000000 --- a/test/tests/test_harness/multiple_duplicate_outputs +++ /dev/null @@ -1,7 +0,0 @@ -[Tests] - [./a] - type = Exodiff - input = good.i - exodiff = 'foo.e foo.e' - [../] -[] diff --git a/test/tests/time_steppers/time_adaptive/tests b/test/tests/time_steppers/time_adaptive/tests index 004b4c1990eb..211b25805acd 100644 --- a/test/tests/time_steppers/time_adaptive/tests +++ b/test/tests/time_steppers/time_adaptive/tests @@ -13,5 +13,8 @@ # the execution slows down so much that the set delays in SlowProblem become # small compared to the overall execution overhead and the test fails. 
valgrind = NONE + + # HPC runtime isn't reliable enough + hpc = false [] [] diff --git a/test/tests/vectorpostprocessors/work_balance/tests b/test/tests/vectorpostprocessors/work_balance/tests index 5f273b419899..3c67f70d7d6a 100644 --- a/test/tests/vectorpostprocessors/work_balance/tests +++ b/test/tests/vectorpostprocessors/work_balance/tests @@ -13,7 +13,7 @@ min_parallel = 2 max_parallel = 2 mesh_mode = replicated - + hpc = false # different MPI partitioning detail = 'on replicated meshes, and' [] @@ -25,6 +25,7 @@ min_parallel = 2 max_parallel = 2 mesh_mode = distributed + hpc = false # different MPI partitioning detail = 'on distributed meshes.' []
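For reference, a minimal usage sketch of the two helpers this patch adds to python/TestHarness/util.py (trimOutput and outputHeader). The import path and the presence of the TestHarness Python dependencies (such as pyyaml) are assumptions here, not something the patch sets up:

```python
# Minimal usage sketch of the new helpers in python/TestHarness/util.py.
# Assumes MOOSE's python/ directory is on PYTHONPATH and the TestHarness
# dependencies (e.g. pyyaml) are installed; both are assumptions, not
# something this patch configures.
from TestHarness import util

# outputHeader() wraps a title in 80-character '#' separator lines;
# ending=False omits the trailing separator.
print(util.outputHeader('Output from tests/test_harness.always_ok'))

# trimOutput() now takes the text and an optional max_size directly
# (the old version pulled these from the job/options). Oversized output
# keeps roughly the first 2/3 and last 1/3, with an 'Output trimmed'
# marker in between.
long_output = 'x' * 500_000
trimmed = util.trimOutput(long_output, max_size=100_000)
print(len(trimmed), 'Output trimmed' in trimmed)

# Without a max_size (or when the output is below the limit) the text
# is returned unchanged.
assert util.trimOutput('short output') == 'short output'
```

The refactor makes trimOutput a pure function of the text and a size limit, presumably so the separate per-object outputs exercised in test_WriteResults.py (e.g. runner_run, tester) can share one trimming path.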