
Commit

Merge pull request #10 from stackhpc/drain-resume
Drain/resume toggle (v0.4.0)
brtkwr authored May 21, 2019
2 parents 3eadb46 + 54509da commit 3df7f67
Showing 5 changed files with 139 additions and 21 deletions.
91 changes: 74 additions & 17 deletions README.md
@@ -19,18 +19,42 @@ Role Variables

`openhpc_packages`: additional OpenHPC packages to install

`openhpc_enable`: which parts of the role to run on a host (see the example below):
* `control`: whether to enable the control host
* `batch`: whether to enable compute nodes
* `runtime`: whether to enable the OpenHPC runtime
* `drain`: whether to drain compute nodes
* `resume`: whether to resume compute nodes
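
For example, a host acting as the Slurm controller might enable only `control` and `runtime`, leaving the drain/resume toggles off during a normal deployment (a minimal sketch; the values are illustrative, not defaults):

# Illustrative values only, e.g. in group_vars for the control host: run the
# control and runtime tasks here, and skip compute, drain and resume handling.
openhpc_enable:
  control: true
  batch: false
  runtime: true
  drain: false
  resume: false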

Example Inventory
-----------------

Define an Ansible inventory such as this:

[openhpc_login]
openhpc-login-0 ansible_host=10.60.253.40 ansible_user=centos

[openhpc_compute]
openhpc-compute-0 ansible_host=10.60.253.31 ansible_user=centos
openhpc-compute-1 ansible_host=10.60.253.32 ansible_user=centos

[cluster_login:children]
openhpc_login

[cluster_control:children]
openhpc_login

[cluster_batch:children]
openhpc_compute

Example Playbooks
-----------------

To deploy, create a playbook which looks like this:

---
- hosts:
    - cluster_login
    - cluster_control
    - cluster_batch
  become: yes
@@ -53,19 +77,52 @@ To deploy, create a playbook which looks like this:
openhpc_packages: []
...

To drain nodes, for example before scaling the cluster down to 6 nodes, create a playbook like this:

---
- hosts: openstack
  gather_facts: false
  vars:
    partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
    openhpc_slurm_partitions:
      - name: "compute"
        flavor: "compute-A"
        image: "CentOS7.5-OpenHPC"
        num_nodes: 6
        user: "centos"
    openhpc_cluster_name: openhpc
  roles:
    # Our stackhpc.cluster-infra role can be invoked in `query` mode, which
    # looks up the state of the cluster by querying the Heat API.
    - role: stackhpc.cluster-infra
      cluster_name: "{{ cluster_name }}"
      cluster_state: query
      cluster_params:
        cluster_groups: "{{ cluster_groups }}"
  tasks:
    # Given that the original cluster had 8 nodes and the target cluster has
    # 6, the computed desired_state variable stores the list of instances to
    # leave untouched.
    - name: Count the number of compute nodes per Slurm partition
      set_fact:
        desired_state: "{{ ((partition | first).nodes | map(attribute='name') | list)[:item.num_nodes] + desired_state | default([]) }}"
      when: partition | length > 0
      with_items: "{{ openhpc_slurm_partitions }}"
    - debug: var=desired_state

- hosts: cluster_batch
  become: yes
  vars:
    desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
  roles:
    # Now the stackhpc.openhpc role is invoked in drain/resume mode: instances
    # in desired_state are resumed if they are drained, and all other
    # instances are drained.
    - role: stackhpc.openhpc
      openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
      openhpc_enable:
        drain: "{{ inventory_hostname not in desired_state }}"
        resume: "{{ inventory_hostname in desired_state }}"
...
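
After this play has run, the drained set can be checked from the Slurm control host. A minimal sketch, which simply re-runs the same `sinfo` query used by the role's drain tasks:

---
- hosts: cluster_control
  gather_facts: false
  tasks:
    # Illustrative check only: list the nodes currently in the DRAINED state.
    - name: List drained nodes
      command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
      register: drained_nodes
      changed_when: false
    - debug: var=drained_nodes.stdout_lines
...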

7 changes: 6 additions & 1 deletion defaults/main.yml
@@ -1,11 +1,16 @@
---
openhpc_slurm_service_enabled: true
openhpc_slurm_service:
openhpc_slurm_control_host: "{{ inventory_hostname }}"
openhpc_slurm_partitions: []
openhpc_cluster_name:
openhpc_packages: []
openhpc_drain_timeout: 86400
openhpc_resume_timeout: 300
openhpc_retry_delay: 10
openhpc_enable:
  control: false
  batch: false
  runtime: false
  drain: false
  resume: false
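
The drain and resume waits are controlled by `openhpc_drain_timeout`, `openhpc_resume_timeout` and `openhpc_retry_delay` (all in seconds). They can be overridden like any other default, for example in group_vars; a sketch with illustrative values:

# Illustrative override, e.g. in group_vars for the compute group: wait at
# most an hour for a node to drain, polling every 30 seconds.
openhpc_drain_timeout: 3600
openhpc_retry_delay: 30
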
24 changes: 24 additions & 0 deletions tasks/drain.yml
@@ -0,0 +1,24 @@
---
# Ansible tasks to drain a Slurm compute node, run on the Slurm control host.
# Waits for the node to reach the DRAINED state, for up to a day by default.
#
# Variables:
# - openhpc_drain_timeout: seconds to wait for the node to drain (default 86400)
# - openhpc_retry_delay: seconds between state checks (default 10)

- name: Get nodes in DRAINED state
  command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
  register: drained_nodes_results
  changed_when: false

- name: Drain compute node
  command: "scontrol update nodename={{ inventory_hostname }} state=DRAIN reason='maintenance'"
  when: inventory_hostname not in drained_nodes_results.stdout_lines

- name: Check node has drained
  command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
  register: drained_nodes
  until: "inventory_hostname in drained_nodes.stdout_lines"
  delay: "{{ openhpc_retry_delay }}"
  retries: "{{ (openhpc_drain_timeout/openhpc_retry_delay) | int }}"
  changed_when: false
14 changes: 11 additions & 3 deletions tasks/main.yml
@@ -1,10 +1,18 @@
---
- include: control.yml
  when: openhpc_enable.control | default(false) | bool

- include: compute.yml
  when: openhpc_enable.batch | default(false) | bool

- include: runtime.yml
  when: openhpc_enable.runtime | default(false) | bool

- include: drain.yml
  when: openhpc_enable.drain | default(false) | bool
  delegate_to: "{{ openhpc_slurm_control_host }}"

- include: resume.yml
  when: openhpc_enable.resume | default(false) | bool
  delegate_to: "{{ openhpc_slurm_control_host }}"
...
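
Because drain.yml and resume.yml are delegated to `openhpc_slurm_control_host`, compute hosts need that variable to point at the Slurm controller rather than its default of `inventory_hostname`. A sketch, matching what the README playbook passes to the role (illustrative placement in group_vars for the compute group):

# Illustrative: ensure drain/resume commands run on the Slurm control host.
openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
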
24 changes: 24 additions & 0 deletions tasks/resume.yml
@@ -0,0 +1,24 @@
---
# Ansible tasks to resume a Slurm compute node, run on the Slurm control host.
# Waits for the node to return to the ALLOC or IDLE state, for up to 5 minutes
# by default.
#
# Variables:
# - openhpc_resume_timeout: seconds to wait for the node to resume (default 300)
# - openhpc_retry_delay: seconds between state checks (default 10)

- name: Get nodes in ALLOC,IDLE states
  command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
  register: resumed_nodes_results
  changed_when: false

- name: Resume compute node
  command: "scontrol update nodename={{ inventory_hostname }} state=RESUME"
  when: inventory_hostname not in resumed_nodes_results.stdout_lines

- name: Check node has resumed
  command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
  register: resumed_nodes
  until: "inventory_hostname in resumed_nodes.stdout_lines"
  delay: "{{ openhpc_retry_delay }}"
  retries: "{{ (openhpc_resume_timeout/openhpc_retry_delay) | int }}"
  changed_when: false
