Skip to content

Commit

Permalink
Merge pull request #288 from LLNL/feature/continue_ats
Browse files Browse the repository at this point in the history
Rerun ATS failures automatically
  • Loading branch information
ldowen authored Jul 30, 2024
2 parents 3afb918 + d39a768 commit aace2ac
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 264 deletions.
20 changes: 17 additions & 3 deletions .gitlab/scripts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

.build:
stage: build_and_install
variables:
GIT_STRATEGY: none
script:
- CI_BUILD_DIR=$(cat ci-dir.txt)
- cd $CI_BUILD_DIR && cat job-name.txt
Expand All @@ -40,19 +42,25 @@

.run_ats:
stage: run_ats
variables:
GIT_STRATEGY: none
FF_ENABLE_BASH_EXIT_CODE_CHECK: 1
FF_USE_NEW_BASH_EVAL_STRATEGY: 1
script:
- CI_BUILD_DIR=$(cat ci-dir.txt)
- cd $CI_BUILD_DIR && cat job-name.txt

- cat build_gitlab/install/spheral-lcatstest
- $TEST_ALLOC ./build_gitlab/install/spheral-lcatstest --logs test-logs build_gitlab/install/$ATS_FILE --timelimit="45m"
- ./build_gitlab/install/spheral $SCRIPT_DIR/gitlab/run_ats.py --test-alloc "$TEST_ALLOC" --ats-file $ATS_FILE --ci-build-dir $CI_BUILD_DIR || exit_code=$?
- cp -r test-logs $CI_PROJECT_DIR
- ./build_gitlab/install/spheral $SCRIPT_DIR/gitlab/report_results.py
- exit $exit_code
artifacts:
when: always
paths:
- ci-dir.txt
- test-logs/
allow_failure:
exit_codes:
- 80

# ------------------------------------------------------------------------------
# Shared TPL scripts.
Expand All @@ -65,6 +73,8 @@

.toss_update_permissions:
stage: update_permissions
variables:
GIT_STRATEGY: none
script:
- ml load mpifileutils
- srun -N 1 -p $PARTITION -n 20 -t 10 dchmod --mode go+rx $UPSTREAM_DIR
Expand Down Expand Up @@ -123,6 +133,8 @@

.prod_permissions:
stage: update_permissions
variables:
GIT_STRATEGY: none
script:
- INSTALL_DIR=$(cat install-dir.txt)

Expand All @@ -143,6 +155,8 @@
# and never fill the sphapp workspace storage.
.clean_dirs:
stage: cleanup
variables:
GIT_STRATEGY: none
script:
- ml load mpifileutils
- cd $SPHERAL_BUILDS_DIR
Expand Down
7 changes: 5 additions & 2 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@ Notable changes include:
* RAJA & Umpire added as first level dependencies.
* Axom updated to v0.9.0.
* TPL builds have been split off into a separate Gitlab CI stage to help with timeouts on allocations.
* Failed ATS runs are automatically retested once in the Gitlab CI.
* Python execute command is centralized in scripts/spheralutils.py now.

* Build changes / improvements:
* Distributed source directory must always be built now
* Distributed source directory must always be built now.
* Git strategies in the Gitlab CI are fixed so a clone only occurs on the first stage for each job, instead of for all stages for each job.

* Bug Fixes / improvements:
* Wrappers for MPI calls are simplified and improved
* Wrappers for MPI calls are simplified and improved.

Version v2024.06.1 -- Release date 2024-07-09
==============================================
Expand Down
36 changes: 6 additions & 30 deletions scripts/devtools/host-config-build.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import os
import sys
import argparse
import subprocess

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from spheralutils import sexe

source_dir=os.getcwd()

Expand Down Expand Up @@ -36,24 +38,6 @@ def parse_args():

return parser.parse_args()


# Helper function for executing commands stolen from uberenv
def sexe(cmd, ret_output=False, echo=False):
    """Run a command line through the shell.

    Args:
        cmd: Shell command string to execute.
        ret_output: When true, capture stdout and stderr (merged into one
            stream) and hand them back together with the exit status.
        echo: When true, print the command before it is executed.

    Returns:
        ``(returncode, output)`` if ``ret_output`` is true, where ``output``
        is the UTF-8 decoded combined stdout/stderr text; otherwise only the
        command's integer return code.
    """
    if echo:
        print("[exe: {0}]".format(cmd))

    # Simple case first: let the child write straight to our stdout/stderr.
    if not ret_output:
        return subprocess.run(cmd, shell=True).returncode

    finished = subprocess.run(cmd,
                              shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
    return finished.returncode, finished.stdout.decode('utf8')


def main():
args = parse_args()
print(args)
Expand All @@ -63,7 +47,7 @@ def main():
hostconfig_path=args.host_config
else:
hostconfig_path=os.path.abspath(args.host_config)


# Set up our directory structure paths.
if not args.build_dir:
Expand All @@ -76,7 +60,7 @@ def main():
install_dir=args.install_dir
build_dir=build_dir+"/build"
# Pull the cmake command to use out of our host config.
cmake_cmd=sexe("grep 'CMake executable' \"{0}\"".format(hostconfig_path), ret_output=True, echo=True)[1].split()[-1]
cmake_cmd=sexe("grep 'CMake executable' \"{0}\"".format(hostconfig_path), ret_output=True, echo=False).split()[-1]

cmake_extra_args=""
if args.D and args.D != ['']:
Expand Down Expand Up @@ -123,15 +107,7 @@ def main():
print("~~~~~ Building Spheral")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

build_result = sexe("{0} {1} --build . -j 48 --target install".format(ml_cmd, cmake_cmd), echo=True)

# If our build or install failed, run again to get our first error.
if build_result != 0:
print(build_result)
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Compilation failed")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
sys.exit(1)
sexe("{0} {1} --build . -j 48 --target install".format(ml_cmd, cmake_cmd), echo=True, ret_output=False)

if __name__ == "__main__":
main()
44 changes: 3 additions & 41 deletions scripts/devtools/tpl-manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import argparse
import os
import sys
import subprocess
import json

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from spheralutils import sexe

#------------------------------------------------------------------------------

project_dir=os.path.abspath(os.path.join(os.path.realpath(__file__), "../../../"))
Expand Down Expand Up @@ -64,46 +66,6 @@ def parse_args():

return parser.parse_args()


# Helper function for executing commands stolen from uberenv
def sexe(cmd, ret_output=False, echo=True):
    """Run a shell command, optionally streaming and collecting its output.

    Args:
        cmd: Shell command string to execute.
        ret_output: When true, stdout and stderr are merged, printed line by
            line as they arrive, and returned as a single string.
        echo: When true, print the command before running it; in capture
            mode the full collected output is also printed once the command
            has finished.

    Returns:
        The collected output text when ``ret_output`` is true, else ``None``.

    Raises:
        subprocess.CalledProcessError: only when ``ret_output`` is false and
            the command exits with a non-zero status. The exit status is not
            checked in capture mode.
    """
    if echo:
        print("[exe: {0}]".format(cmd))

    # No capture requested: run() is enough, and check=True turns a failing
    # command into an exception.
    if not ret_output:
        subprocess.run(cmd, shell=True, check=True, text=True)
        return None

    # subprocess cannot both capture and show output live, so read the pipe
    # ourselves. The context manager closes the pipe and reaps the child,
    # and iterating the pipe blocks until EOF instead of polling in a loop.
    collected = []
    with subprocess.Popen(cmd,
                          shell=True,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          encoding='utf8') as proc:
        for line in proc.stdout:
            print(line.strip(), flush=True)
            collected.append(line)

    out = "".join(collected)
    if echo:
        print(out)
    return out


# Parse the json formatted spec list...
def parse_spec_list(file_path):
with open(file_path) as f:
Expand Down
22 changes: 5 additions & 17 deletions scripts/gitlab/build_and_install.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import os
import sys
import argparse
import subprocess

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from spheralutils import sexe

#------------------------------------------------------------------------------

Expand Down Expand Up @@ -41,20 +43,6 @@ def parse_args():

return parser.parse_args()


# Helper function for executing commands stolen from uberenv
def sexe(cmd, ret_output=False, echo=True):
    """Execute a shell command, raising if it fails.

    Args:
        cmd: Shell command string to execute.
        ret_output: When true, capture the command's output and return its
            stdout text.
        echo: When true, print the command first and, in capture mode, the
            captured stdout afterwards.

    Returns:
        The captured stdout string when ``ret_output`` is true, else ``None``.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    if echo:
        print("[exe: {0}]".format(cmd))

    completed = subprocess.run(cmd,
                               shell=True,
                               capture_output=ret_output,
                               check=True,
                               text=True)
    if not ret_output:
        return None

    captured = completed.stdout
    if echo:
        print(captured)
    return captured

#------------------------------------------------------------------------------

def main():
Expand All @@ -68,7 +56,7 @@ def main():

# Get the host-config name and path.
if not args.build_only and not args.host_config:
hostconfig="{1}-{2}.cmake".format(host, sys_type, (args.spec).replace(" ","_"))
hostconfig="{0}-{1}.cmake".format(sys_type, (args.spec).replace(" ","_"))
sexe("cp {0} gitlab.cmake".format(hostconfig))
hostconfig_path=os.path.join(os.getcwd(), "gitlab.cmake")
else:
Expand All @@ -77,7 +65,7 @@ def main():
print(hostconfig)

if not args.tpls_only:
if sexe("{0} --host-config=\"{1}\" --lc-modules=\"{2}\" --build {3}".format(host_congfig_build_cmd, hostconfig_path, args.lc_modules, args.extra_cmake_args)) : sys.exit(1)
sexe("{0} --host-config=\"{1}\" --lc-modules=\"{2}\" --build {3}".format(host_congfig_build_cmd, hostconfig_path, args.lc_modules, args.extra_cmake_args))

if __name__ == "__main__":
main()
12 changes: 0 additions & 12 deletions scripts/gitlab/report_results.py

This file was deleted.

107 changes: 107 additions & 0 deletions scripts/gitlab/run_ats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3

import sys, subprocess, argparse, os

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from spheralutils import sexe

# If the number of failed tests exceeds this value, ATS is not rerun
max_test_failures = 10
# Number of times to rerun the ATS tests
max_reruns = 1

#------------------------------------------------------------------------------

def parse_args():
parser = argparse.ArgumentParser()

# Spec args
parser.add_argument('--test-alloc', type=str, nargs="+",
help='Allocation command for the machine.')
parser.add_argument('--ats-file', type=str,
help='ATS test file to run.')
parser.add_argument('--ci-build-dir', type=str,
help='CI build directory.')
parser.add_argument('--ci-install-dir', type=str,
default="build_gitlab/install",
help="Location of Spheral installation "+\
"relative to --ci-build-dir")
return parser.parse_args()

#------------------------------------------------------------------------------

# Execute atsr.py from the ATS log directory and return the number of failed tests
def report_results(output_dir):
    """Count the failed or timed-out tests recorded in ``output_dir``.

    The file ``atsr.py`` inside ``output_dir`` is executed as Python in this
    module's global namespace. It is expected to define ``state`` (with a
    ``'testlist'`` of entries carrying ``'status'`` and ``'name'``) as well
    as the ``FAILED`` and ``TIMEDOUT`` status values.

    Args:
        output_dir: Directory containing ``atsr.py``.

    Returns:
        Number of tests whose status is ``FAILED`` or ``TIMEDOUT`` (0 when
        everything passed).

    Exits the process with status 1 if ``atsr.py`` does not exist.
    """
    ats_py = os.path.join(output_dir, "atsr.py")
    if not os.path.exists(ats_py):
        print(f"{ats_py} does not exists")
        sys.exit(1)

    # Read inside a context manager so the file handle is always closed.
    with open(ats_py) as results_file:
        source = results_file.read()

    # NOTE(review): exec runs arbitrary code from atsr.py; this presumes the
    # file was produced by the ATS run itself and is trusted.
    module_globals = globals()
    exec(compile(source, ats_py, 'exec'), module_globals)

    state = module_globals["state"]
    bad_statuses = (module_globals["FAILED"], module_globals["TIMEDOUT"])
    failed_tests = [t for t in state['testlist'] if t['status'] in bad_statuses]

    if failed_tests:
        print(f"ATS failed {len(failed_tests)} tests.")
        for t in failed_tests:
            print(t['name'])
    else:
        print("ATS passed all tests.")
    return len(failed_tests)

#------------------------------------------------------------------------------

# Run the tests and check if any failed
def run_and_report(run_command, ci_output, num_runs):
    """Run ATS, inspect the results, and rerun on failure up to the limit.

    Args:
        run_command: Full shell command that launches ATS.
        ci_output: Log directory passed to ``report_results`` and searched
            for ``continue.ats``.
        num_runs: Number of ATS runs already performed (0 on the first call).

    This function does not return normally: it ends the process with
    status 0 once a run reports no failures, or status 1 when the rerun
    limit is exceeded, too many tests failed, or ``continue.ats`` is missing.
    """
    command = run_command
    attempt = num_runs
    while True:
        if attempt > max_reruns:
            print("Exceeded number of ATS reruns")
            sys.exit(1)

        sexe(command)
        num_failed = report_results(ci_output)

        if num_failed == 0:
            if attempt > 0:
                print("WARNING: Some tests were run multiple times")
            # Once Jacamar can handle exit codes properly, a successful
            # rerun (attempt > 0) should exit with 80 rather than 0.
            sys.exit(0)

        if num_failed >= max_test_failures:
            print("Too many test failures, not rerunning ATS")
            sys.exit(1)

        # Only the first rerun appends the continuation file; later attempts
        # reuse the command, which already carries it.
        if attempt == 0:
            ats_cont_file = os.path.join(ci_output, "continue.ats")
            if not os.path.exists(ats_cont_file):
                print(f"{ats_cont_file} not found, ATS cannot be rerun")
                sys.exit(1)
            command = f"{command} {ats_cont_file}"

        print("WARNING: Test failure, rerunning ATS")
        attempt += 1

#------------------------------------------------------------------------------

def run_ats_test(args):
    """Assemble the ATS launch command from parsed options and run it.

    Args:
        args: Parsed options providing ``ci_build_dir``, ``ci_install_dir``,
            ``ats_file`` and ``test_alloc`` (a list of strings).

    Exits the process with status 1 if either the ATS file or the
    ``spheral-lcatstest`` launcher is missing from the install directory;
    otherwise hands control to ``run_and_report``.
    """
    build_gl_dir = os.path.join(args.ci_build_dir, args.ci_install_dir)

    ats_file = os.path.join(build_gl_dir, args.ats_file)
    if not os.path.exists(ats_file):
        print(f"{ats_file} does not exists")
        sys.exit(1)

    lcats_test = os.path.join(build_gl_dir, "spheral-lcatstest")
    if not os.path.exists(lcats_test):
        print(f"{lcats_test} does not exists")
        # Abort here as well: without the launcher the run command below
        # cannot succeed.
        sys.exit(1)

    ats_configs = ' --timelimit="45m"'
    test_alloc = " ".join(args.test_alloc)
    run_command = f"{test_alloc} {lcats_test} --logs test-logs {ats_file} {ats_configs}"
    ci_output = os.path.join(args.ci_build_dir, "test-logs")
    run_and_report(run_command, ci_output, 0)

#------------------------------------------------------------------------------

def main():
    """Script entry point: parse the command line and launch the ATS run."""
    run_ats_test(parse_args())

if __name__ == "__main__":
main()
Loading

0 comments on commit aace2ac

Please sign in to comment.