-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from stackhpc/drain-resume
Drain/resume toggle (v0.4.0)
- Loading branch information
Showing
5 changed files
with
139 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,16 @@ | ||
--- | ||
# Default values for the role's openhpc_* variables. | ||
openhpc_slurm_service_enabled: true | ||
openhpc_slurm_service: | ||
# Host that the drain and resume task files are delegated to. | ||
openhpc_slurm_control_host: | ||
openhpc_slurm_control_host: "{{ inventory_hostname }}" | ||
openhpc_slurm_partitions: [] | ||
openhpc_cluster_name: | ||
openhpc_packages: [] | ||
# Seconds to wait for a node to be reported as DRAINED. | ||
openhpc_drain_timeout: 86400 | ||
# Seconds to wait for a node to be reported as resumed. | ||
openhpc_resume_timeout: 300 | ||
# Seconds between successive node-state checks while waiting; the timeouts | ||
# above are divided by this value to obtain the number of retries. | ||
openhpc_retry_delay: 10 | ||
# Toggles that select which task files the role includes; all off by default. | ||
openhpc_enable: | ||
control: false | ||
batch: false | ||
runtime: false | ||
drain: false | ||
resume: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
--- | ||
# Ansible tasks to drain a Slurm compute node. Waits for the compute node to be | ||
# drained for up to a day by default. | ||
# | ||
# Variables: | ||
# - inventory_hostname: compute node to drain | ||
# - openhpc_drain_timeout: seconds to wait for node to drain, default is 86400. | ||
# - openhpc_retry_delay: seconds between drain checks, default is 10. | ||
|
||
# List the node names that sinfo currently reports as DRAINED. The query is | ||
# read-only, so the task is never reported as changed. | ||
- name: Get nodes in DRAINED state | ||
command: "sinfo --noheader --Node --format='%N' --states=DRAINED" | ||
register: drained_nodes_results | ||
changed_when: false | ||
|
||
# Request a drain for this host, skipped when the listing above already | ||
# contains it. | ||
- name: Drain compute node | ||
command: "scontrol update nodename={{ inventory_hostname }} state=DRAIN reason='maintenance'" | ||
when: inventory_hostname not in drained_nodes_results.stdout_lines | ||
|
||
# Re-run the DRAINED listing every openhpc_retry_delay seconds until this host | ||
# appears in it. The retry count is openhpc_drain_timeout divided by | ||
# openhpc_retry_delay, truncated to an integer. | ||
- name: Check node has drained | ||
command: "sinfo --noheader --Node --format='%N' --states=DRAINED" | ||
register: drained_nodes | ||
until: "inventory_hostname in drained_nodes.stdout_lines" | ||
delay: "{{ openhpc_retry_delay }}" | ||
retries: "{{ (openhpc_drain_timeout/openhpc_retry_delay) | int }}" | ||
changed_when: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,18 @@ | ||
--- | ||
# Each task file is included only when its openhpc_enable toggle is truthy. | ||
# NOTE(review): where two `when` lines follow one include, the first looks like | ||
# the pre-change line of the diff and the second its replacement - confirm. | ||
- include: control.yml | ||
when: openhpc_enable.control | bool | ||
when: openhpc_enable.control | default(false) | bool | ||
|
||
# compute.yml is gated by the `batch` toggle. | ||
- include: compute.yml | ||
when: openhpc_enable.batch | bool | ||
when: openhpc_enable.batch | default(false) | bool | ||
|
||
- include: runtime.yml | ||
when: openhpc_enable.runtime | bool | ||
when: openhpc_enable.runtime | default(false) | bool | ||
|
||
# Drain tasks, with delegate_to set to the Slurm control host. | ||
- include: drain.yml | ||
when: openhpc_enable.drain | default(false) | bool | ||
delegate_to: "{{ openhpc_slurm_control_host }}" | ||
|
||
# Resume tasks, with delegate_to set to the Slurm control host. | ||
- include: resume.yml | ||
when: openhpc_enable.resume | default(false) | bool | ||
delegate_to: "{{ openhpc_slurm_control_host }}" | ||
... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
--- | ||
# Ansible tasks to resume a Slurm compute node. Waits up to 5 minutes by | ||
# default for the compute node to resume. | ||
# | ||
# Variables: | ||
# - inventory_hostname: compute node to resume | ||
# - openhpc_resume_timeout: seconds to wait for node to resume, default is 300. | ||
# - openhpc_retry_delay: seconds between resume checks, default is 10. | ||
|
||
# List the node names that sinfo reports as available for work. MIXED is | ||
# included alongside ALLOC and IDLE so that a partially allocated node is | ||
# recognised as already resumed. The query is read-only, so the task is never | ||
# reported as changed. | ||
- name: Get nodes in ALLOC,IDLE,MIXED states | ||
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE,MIXED" | ||
register: resumed_nodes_results | ||
changed_when: false | ||
|
||
# Request a resume for this host, skipped when the listing above already | ||
# contains it, so a node that is already in service is left untouched. | ||
- name: Resume compute node | ||
command: "scontrol update nodename={{ inventory_hostname }} state=RESUME" | ||
when: inventory_hostname not in resumed_nodes_results.stdout_lines | ||
|
||
# Re-run the listing every openhpc_retry_delay seconds until this host appears | ||
# in it. The retry count is openhpc_resume_timeout divided by | ||
# openhpc_retry_delay, truncated to an integer. MIXED is accepted here too, so | ||
# a node that picks up a partial allocation right after resuming still counts. | ||
- name: Check node has resumed | ||
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE,MIXED" | ||
register: resumed_nodes | ||
until: "inventory_hostname in resumed_nodes.stdout_lines" | ||
delay: "{{ openhpc_retry_delay }}" | ||
retries: "{{ (openhpc_resume_timeout/openhpc_retry_delay) | int }}" | ||
changed_when: false |