Skip to content

Commit

Permalink
Merge pull request #27561 from loganharbour/apptainer_hpc
Browse files Browse the repository at this point in the history
New HPC scheduler
  • Loading branch information
loganharbour authored Oct 28, 2024
2 parents e96644b + 345144f commit f5a01db
Show file tree
Hide file tree
Showing 120 changed files with 4,224 additions and 2,767 deletions.
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -326,3 +326,14 @@ share/
/modules/misc/misc.yaml
/tutorials/tutorial01_app_development/*/babbler.yaml
/tutorials/darcy_thermo_mech/*/darcy_thermo_mech.yaml

# test harness --sep-files output
*.runner_run_out.txt
*.runner_out.txt
*.tester_out.txt
*.job_out.txt

# test harness hpc output
*.hpc_out.txt
*.hpc_result
*.hpc_submit
1 change: 1 addition & 0 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@

/python/MooseDocs @cticenhour
/python/moosesqa @cticenhour
/python/TestHarness @loganharbour @milljm

/scripts/hpc_proxy.pac @loganharbour
/scripts/configure_petsc.sh @cticenhour @milljm @loganharbour
Expand Down
11 changes: 11 additions & 0 deletions framework/app.mk
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,17 @@ install_data_%:
@mkdir -p $($@_dst)
@cp -r $($@_src) $($@_dst)

# Register a per-application tester install target, but only when the
# application actually has a scripts/TestHarness/testers directory
# ($(wildcard ...) expands to empty if the path does not exist).
# The _src/_dst variables below are looked up by name in the
# install_tester_% pattern rule through $($@_src) and $($@_dst), so their
# names must stay in the form install_tester_<application name>_{src,dst}.
ifneq ($(wildcard $(APPLICATION_DIR)/scripts/TestHarness/testers),)
install_tester_$(APPLICATION_NAME)_src := $(APPLICATION_DIR)/scripts/TestHarness/testers
install_tester_$(APPLICATION_NAME)_dst := $(share_install_dir)/scripts/TestHarness
# Double-colon rule: adds this application's target as a prerequisite of install_testers.
install_testers:: install_tester_$(APPLICATION_NAME)
endif

# Pattern rule that installs one application's custom TestHarness testers.
# $@ is the full target name (install_tester_<application name>), so
# $($@_src) and $($@_dst) resolve to the matching
# install_tester_<application name>_src / _dst variables.
# The expanded paths are quoted so that directories containing whitespace or
# shell glob characters are passed to echo/mkdir/cp as a single argument.
install_tester_%:
	@echo "Installing TestHarness testers $($@_dst)..."
	@mkdir -p "$($@_dst)"
	@cp -r "$($@_src)" "$($@_dst)"

$(copy_input_targets):
@$(eval kv := $(subst ->, ,$(subst target_$(APPLICATION_NAME)_,,$@)))
@$(eval source_dir := $(word 1, $(kv)))
Expand Down
5 changes: 4 additions & 1 deletion framework/moose.mk
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,15 @@ moose_share_dir = $(share_dir)/moose
python_install_dir = $(moose_share_dir)/python
bin_install_dir = $(PREFIX)/bin

install: all install_all_libs install_bin install_harness install_exodiff install_adreal_monolith install_hit install_data
install: all install_all_libs install_bin install_harness install_exodiff install_adreal_monolith install_hit install_data install_testers

# Copy the framework's data directory into $(moose_share_dir), creating the
# destination first. Declared as a double-colon rule, so each install_data::
# rule in the build is independent and has its own recipe.
install_data::
@mkdir -p $(moose_share_dir)
@cp -a $(FRAMEWORK_DIR)/data $(moose_share_dir)/

# Base definition of install_testers, which is a prerequisite of `install`.
# The recipe `:` is the shell no-op, so this rule does nothing by itself; it
# only guarantees the target exists. Being a double-colon rule, further
# install_testers:: rules can attach per-application prerequisites.
install_testers::
@:

install_adreal_monolith: ADRealMonolithic.h
@ mkdir -p $(moose_include_dir)
@cp -f $< $(moose_include_dir)/
Expand Down
2 changes: 0 additions & 2 deletions modules/doc/content/application_development/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ These documentation pages are meant to be used by developers who are developing

[Test System](/test_system.md) - How to create/maintain tests for your application

[Performance Benchmarking](/performance_benchmarking.md) - How to perform benchmarking

[Profiling](/profiling.md) - How to profile your application in order to determine what functions are hogging compute time.

[Code Coverage](/coverage.md) - How to add automatic code coverage to your application, and use it in your development workflow
Expand Down
Original file line number Diff line number Diff line change
@@ -1,165 +1,4 @@
# Performance Benchmarking

Utilities for doing performance benchmarking of MOOSE-based applications are included in the main
MOOSE repository. These utilities provide functionality for benchmarking and tracking MOOSE
performance. They can be used to run benchmarks, generate trend visualizations, and look at stats
comparing benchmarks between various revisions. The following sections describe how to setup a
benchmark machine and use it to run benchmarks and visualize results.

## Tuning a Benchmarking Machine

In order to obtain accurate results, you need to run the benchmark process(es)
as close to isolated as possible. On a linux system, you should e.g. use cpu
isolation via setting kernel boot parameters:

```text
isolcpus=[n] rcu_nocbs=[n]
```

in your boot loader (e.g. grub). The benchmarking tools/scripts in MOOSE should automatically
detect CPU isolation on Linux and schedule benchmark jobs to those CPUs. You should also disable
any turbo functionality. For example on `intel_pstate` driver cpus:

```text
$ echo "1" > /sys/devices/system/cpu/intel_pstate/no_turbo
```

You will also want to turn off any hyperthreading for cores you use for benchmarking. You can do
this in the bios or by something like:

```text
$ echo "0" > /sys/devices/system/cpu/cpu[n]/online
```

for each hyperthread core you want to disable - you can look in `/proc/cpuinfo` for pairs of cpus
that have the same core id and turn off one of each pair. These steps will need to be repeated on every boot.
You can use the sysfsutils package and its `/etc/sysfs.conf` configuration file to do this
persistently on boot - i.e.:

```text
devices/system/cpu/intel_pstate/no_turbo = 1
devices/system/cpu/cpu3/online = 0
devices/system/cpu/cpu5/online = 0
```

## Test Harness Benchmarks

Benchmarks can be run through the test harness (i.e. using the `run_tests` script) by doing
e.g. `./run_tests --run speedtests`. When this is done, the test harness looks for test spec
files named `speedtests` just like the `tests` files that contain regular moose test details.
The format for these files is:

```text
[Benchmarks]
[benchmark-name]
type = SpeedTest
input = input-file-name.i
cli_args = '--an-arg=1 a/hit/format/cli/arg=foo'
# optional:
min_runs = 15 # default 40
max_runs = 100 # default 400
cumulative_dur = 100 # default 60 sec
[]
[./benchmark2-name]
type = SpeedTest
input = another-input-file-name.i
cli_args = 'some/cli/arg=bar'
[]
# ...
[]
```

After being run, benchmark data are stored in a sqlite database (default name
`speedtests.sqlite`). When the test harness is run without the `--run speedtests` flag, tests
described in `speedtests` files are run in *check-only* mode where moose just checks that their
input files are well-formed and parse correctly without actually running them.


## Manual/Direct Benchmarks

The `[moose-repo]/scripts/benchmark.py` script can be used to manually list and directly run benchmarks without the
test harness (for hacking, debugging, etc.). To do this, the script reads a `bench.list` text
file that specifies which input files should be run and corresponding (benchmark) names for them
along with any optional arguments. The `bench.list` file has the following format:

```text
[benchmarks]
[./simple_diffusion_refine3]
binary = test/moose_test-opt
input = test/tests/kernels/simple_diffusion/simple_diffusion.i
cli_args = 'Mesh/uniform_refine=3'
[../]
[./simple_diffusion_refine4]
binary = test/moose_test-opt
input = test/tests/kernels/simple_diffusion/simple_diffusion.i
cli_args = 'Mesh/uniform_refine=4'
[../]
[./simple_diffusion_ref5]
binary = test/moose_test-opt
input = test/tests/kernels/simple_diffusion/simple_diffusion.i
cli_args = 'Mesh/uniform_refine=5'
[../]
# ... add as many as you want
[]
```

To run the manual benchmarks directly, do this:

```text
$ ./scripts/benchmark.py --run
```

When benchmarks are run, the binaries specified in `bench.list` must already exist. Benchmark
data are then stored in a sqlite database (default name `speedtests.sqlite`). You can specify
the minimum number of runs for each benchmark problem/simulation with the `--min-runs` flag (default
10). Each benchmark will be run as many times as possible within 1 minute (customizable via the
`--cum-dur` flag) or the specified minimum number of times (whichever is larger).

## Analyzing Results

Regardless of how you ran the benchmarks (either by this script or using the test harness), MOOSE
revisions with available benchmark data can be listed (from the database) by running:

```text
$ ./benchmark.py --list-revs
44d2f3434b3346dc14fc9e86aa99ec433c1bbf10 2016-09-07 19:36:16
86ced0d0c959c9bdc59497f0bc9324c5cdcd7e8f 2016-09-08 09:29:17
447b455f1e2d8eda649468ed03ef792504d4b467 2016-09-08 09:43:56
...
```

To look at stats comparing benchmark data from two revisions, run:

```text
$ ./benchmark.py # defaults to using the most recent two revisions of benchmark data
-------------------------------- 871c98630c98 to 38bb6f5ebe5f --------------------------------
benchmark old (sec/run) new (sec/run) speedup (pvalue,nsamples)
----------------------------------------------------------------------------------------------
simple diffusion (refine3): 0.408034 0.408034 ~ (p=0.996 n=36+36)
simple diffusion (refine4): 1.554724 1.561682 ~ (p=0.571 n=10+10)
simple diffusion (refine5): 6.592326 6.592326 ~ (p=0.882 n=4+4)
----------------------------------------------------------------------------------------------
$ ./benchmark.py -old 44d2f34 -new 447b455 # or specify revisions to compare manually
------------------------------------- 44d2f34 to 447b455 -------------------------------------
benchmark old (sec/run) new (sec/run) speedup (pvalue,nsamples)
----------------------------------------------------------------------------------------------
simple diffusion (refine3): 0.416574 0.411435 -1.2% (p=0.000 n=37+37)
simple diffusion (refine4): 1.554724 1.497379 -3.7% (p=0.000 n=10+11)
simple diffusion (refine5): 6.553244 6.360004 -2.9% (p=0.030 n=4+4)
----------------------------------------------------------------------------------------------
```

To generate visualizations, run:

```text
$ ./scripts/benchmark.py --trends
```

This will generate an svg box plot for each benchmark over time/revision in a `trends`
subdirectory. An `index.html` file is also generated that embeds all the svg plots for
convenient viewing all together in a browser.

!alert error title=Removed
This content has been removed.
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ There is an entire field of science about [!ac](HPC) and massively parallel proc
!alert tip title=Try to target 20,000 [!ac](DOFs)-per-process.
MOOSE developers tend to agree that 20,000 is the ideal number of [!ac](DOFs) that a single process may be responsible for. This value is reported as "`Num Local DOFs`" in the terminal printout at the beginning of every execution. There are, of course, some exceptions; if a problem exhibits speedup with less than 20,000 [!ac](DOFs)/process, then just use that.

*For more information about application performance, please visit the [application_development/performance_benchmarking.md] page.*

## Demonstration

To demonstrate the importance of parallel execution, the current Darcy pressure input file will be
Expand Down
1 change: 0 additions & 1 deletion modules/doc/content/infrastructure/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,3 @@ of MOOSE and MOOSE-based applications:
- [Python Tools](python/index.md)
- [Build System](/build_system.md)
- [Test System](/test_system.md)
- [Benchmarking](/performance_benchmarking.md)
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
[multiple]
type = RunException
input = 'add_raybc_action.i'
cli_args = 'RayBCs/active=multiple_studies "UserObjects/active=\'study another_study\'"'
cli_args = 'RayBCs/active=multiple_studies UserObjects/active="study another_study"'
expect_err = "While constructing the NullRayBC 'multiple_studies', multiple RayTracingStudy objects were found."
allow_test_objects = true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
[multiple]
type = RunException
input = 'add_raykernel_action.i'
cli_args = 'RayKernels/active=multiple_studies "UserObjects/active=\'study another_study\'"'
cli_args = 'RayKernels/active=multiple_studies UserObjects/active="study another_study"'
expect_err = "While constructing the NullRayKernel 'multiple_studies', multiple RayTracingStudy objects were found."
allow_test_objects = true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
Mesh/Partitioner/nx=2
Mesh/Partitioner/ny=2
Outputs/rays/type=RayTracingNemesis
"Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'"
Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes"
Outputs/rays/file_base=nemesis_rays'
exodiff = 'nemesis_rays.e.4.0 nemesis_rays.e.4.1 nemesis_rays.e.4.2 nemesis_rays.e.4.3'
min_parallel = 4
Expand All @@ -74,7 +74,7 @@
Mesh/Partitioner/nx=2
Mesh/Partitioner/ny=2
Outputs/rays/type=RayTracingNemesis
"Outputs/rays/output_properties=\'intersections pid processor_crossings trajectory_changes\'"
Outputs/rays/output_properties="intersections pid processor_crossings trajectory_changes"
Outputs/rays/file_base=ray_mesh_output_transient_nemesis_rays'
# Missing some files here because exodiff doesn't like diffing empty output,
# which is the case for the early transient when not all procs are hit
Expand Down Expand Up @@ -107,7 +107,7 @@
cli_args = 'Mesh/Partitioner/type=GridPartitioner
Mesh/Partitioner/ny=3
UserObjects/study/segments_on_cache_traces=false
"Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'"
Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes"
Outputs/rays/file_base=no_segments_rays'
exodiff = 'no_segments_rays.e'
min_parallel = 3
Expand All @@ -122,7 +122,7 @@
Mesh/Partitioner/ny=2
Outputs/rays/type=RayTracingNemesis
UserObjects/study/segments_on_cache_traces=false
"Outputs/rays/output_properties=\'ray_id intersections pid processor_crossings trajectory_changes\'"
Outputs/rays/output_properties="ray_id intersections pid processor_crossings trajectory_changes"
Outputs/rays/file_base=no_segments_nemesis_rays'
exodiff = 'no_segments_nemesis_rays.e.2.0 no_segments_nemesis_rays.e.2.1'
min_parallel = 2
Expand Down
14 changes: 7 additions & 7 deletions modules/ray_tracing/test/tests/traceray/internal_sidesets/tests
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
input = 'internal_sidesets_1d.i'
csvdiff = 'internal_sidesets_1d_kill_out.csv'
cli_args = 'Outputs/file_base=internal_sidesets_1d_kill_out
RayBCs/active=\'kill_internal\''
RayBCs/active=kill_internal'
allow_test_objects = true

detail = 'one-dimensional meshes, '
Expand All @@ -19,7 +19,7 @@
input = 'internal_sidesets_2d.i'
csvdiff = 'internal_sidesets_2d_kill_out.csv'
cli_args = 'Outputs/file_base=internal_sidesets_2d_kill_out
RayBCs/active=\'kill_internal\''
RayBCs/active=kill_internal'
allow_test_objects = true

detail = 'two-dimensional meshes, '
Expand All @@ -29,7 +29,7 @@
input = 'internal_sidesets_3d.i'
csvdiff = 'internal_sidesets_3d_kill_out.csv'
cli_args = 'Outputs/file_base=internal_sidesets_3d_kill_out
RayBCs/active=\'kill_internal\''
RayBCs/active=kill_internal'
allow_test_objects = true

detail = 'and three-dimensional meshes.'
Expand All @@ -43,7 +43,7 @@
input = 'internal_sidesets_1d.i'
csvdiff = 'internal_sidesets_1d_reflect_out.csv'
cli_args = 'Outputs/file_base=internal_sidesets_1d_reflect_out
"RayBCs/active=\'kill_external reflect_internal\'"'
RayBCs/active="kill_external reflect_internal"'
allow_test_objects = true

detail = 'one-dimensional meshes, '
Expand All @@ -54,7 +54,7 @@
input = 'internal_sidesets_2d.i'
csvdiff = 'internal_sidesets_2d_reflect_out.csv'
cli_args = 'Outputs/file_base=internal_sidesets_2d_reflect_out
"RayBCs/active=\'kill_external reflect_internal\'"'
RayBCs/active="kill_external reflect_internal"'
allow_test_objects = true

detail = 'two-dimensional meshes, '
Expand All @@ -65,7 +65,7 @@
input = 'internal_sidesets_3d.i'
csvdiff = 'internal_sidesets_3d_reflect_out.csv'
cli_args = 'Outputs/file_base=internal_sidesets_3d_reflect_out
"RayBCs/active=\'kill_external reflect_internal\'"'
RayBCs/active="kill_external reflect_internal"'
allow_test_objects = true

detail = 'and three-dimensional meshes.'
Expand All @@ -76,7 +76,7 @@
type = RunException
input = 'internal_sidesets_1d.i'
cli_args = 'UserObjects/study/use_internal_sidesets=false
RayBCs/active=\'kill_internal\''
RayBCs/active=kill_internal'
expect_err = 'RayBCs are defined on internal sidesets, but the study is not set to use internal sidesets during tracing.'
allow_test_objects = true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
allow_test_objects = true
cli_args = 'Outputs/file_base=bc_create_ray_3d_out
Mesh/active=gmg_3d
"RayBCs/active=\'kill_3d create_3d\'"'
RayBCs/active="kill_3d create_3d"'
detail = 'and in three-dimensional meshes.'
[]
[]
Expand Down
4 changes: 3 additions & 1 deletion modules/thermal_hydraulics/test/tests/utils/logger/tests
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
[errors]
type = RunException
input = 'test.i'
expect_err = "componentB: warning 2.*componentA: error 1.*componentA: error 2.*componentB: error 1.*componentB: error 2"
# We can't reliably check for the warnings first here because the ordering of the warning
# and the error is MPI implementation dependent
expect_err = "componentA: error 1.*componentA: error 2.*componentB: error 1.*componentB: error 2"
allow_test_objects = true
allow_warnings = true # Testing that warnings are emitted
requirement = 'The system shall be able to output errors in a batch.'
Expand Down
Loading

0 comments on commit f5a01db

Please sign in to comment.