
Merge pull request #92 from MissionCriticalCloud/fix-storage-maintenance-mode-rebase

Fix storage maintenance mode rebase
neubauerf authored Oct 12, 2023
2 parents 2111a52 + 7ef23e3 commit dac0d3a
Showing 13 changed files with 212 additions and 142 deletions.
7 changes: 5 additions & 2 deletions cosmicops/empty_host.py
@@ -17,7 +17,7 @@
 from cosmicops import CosmicOps, RebootAction, logging
 
 
-def empty_host(profile, shutdown, skip_disable, dry_run, host):
+def empty_host(profile, shutdown, skip_disable, dry_run, host, target_host):
     click_log.basic_config()
 
     log_to_slack = True
@@ -35,7 +35,10 @@ def empty_host(profile, shutdown, skip_disable, dry_run, host):
     if not host.disable():
         raise RuntimeError(f"Failed to disable host '{host['name']}'")
 
-    (total, success, failed) = host.empty()
+    if target_host:
+        target_host = co.get_host(name=target_host)
+
+    (total, success, failed) = host.empty(target=target_host)
     result_message = f"Result: {success} successful, {failed} failed out of {total} total VMs"
 
     if not failed and shutdown:
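The net effect: empty_host() now accepts an optional target host name, resolves it to a host object via co.get_host(), and passes it through to host.empty(). A minimal usage sketch; the profile name and host names below are hypothetical:

# Minimal sketch of calling the updated library function; the 'config'
# profile and the host names hv01/hv02 are hypothetical.
from cosmicops.empty_host import empty_host

# Empty hv01, forcing hv02 as the migration target instead of letting
# find_migration_host() pick a host per VM. dry_run=True changes nothing.
message = empty_host(profile='config', shutdown=False, skip_disable=False,
                     dry_run=True, host='hv01', target_host='hv02')
print(message)  # e.g. "Result: 12 successful, 0 failed out of 12 total VMs"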
3 changes: 3 additions & 0 deletions cosmicops/objects/cluster.py
@@ -63,6 +63,9 @@ def find_migration_host(self, vm):
             if vm_on_dedicated_hv and not host['dedicated']:
                 continue
 
+            if not vm_on_dedicated_hv and host['dedicated']:
+                continue
+
             if vm_on_dedicated_hv and host['affinitygroupid'] != dedicated_affinity_id:
                 continue
 
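The added lines are the mirror image of the check right above them: a VM that does not live on dedicated hardware must now skip dedicated hosts, just as a dedicated VM already skipped shared ones. A standalone sketch of the resulting filter, with host as a plain dict (the real method applies further checks as well):

# Simplified sketch of the host filter in find_migration_host() after this
# commit; 'host' is a plain dict standing in for the real Cosmic host object.
def host_is_eligible(host, vm_on_dedicated_hv, dedicated_affinity_id):
    if vm_on_dedicated_hv and not host['dedicated']:
        return False  # dedicated VM may not land on a shared host
    if not vm_on_dedicated_hv and host['dedicated']:
        return False  # new check: shared VM may not land on a dedicated host
    if vm_on_dedicated_hv and host['affinitygroupid'] != dedicated_affinity_id:
        return False  # dedicated VM must stay within its affinity group
    return True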
64 changes: 37 additions & 27 deletions cosmicops/objects/host.py
@@ -421,13 +421,20 @@ def wait_until_online(self):
         else:
             logging.info(f"Waiting for '{self['name']}' to come back online", self.log_to_slack)
             with click_spinner.spinner():
-                while True:
-                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                        s.settimeout(5)
-                        result = s.connect_ex((self['name'], 22))
-
-                    if result == 0:
-                        break
+                # adding retry tests, so we need to be able to connect to SSH three times in one minute
+                # before we consider the host up
+                tests = 1
+                logging.info(f"Waiting for SSH connection, attempt {tests} of 3", False)
+                while tests <= 3:
+                    while True:
+                        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                            s.settimeout(5)
+                            result = s.connect_ex((self['name'], 22))
+
+                        if result == 0:
+                            break
+                    time.sleep(20)
+                    tests += 1
 
         if self.dry_run:
             logging.info(f"Would wait for libvirt on '{self['name']}'")
@@ -467,10 +474,10 @@ def wait_for_agent(self):
 
             time.sleep(5)
 
-    def get_disks(self, vm):
+    def get_disks(self, vm_instancename):
         lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
 
-        domain = lv.lookupByName(vm['instancename'])
+        domain = lv.lookupByName(vm_instancename)
 
         tree = ElementTree.fromstring(domain.XMLDesc())
         block_devs = tree.findall('devices/disk')
@@ -483,6 +490,9 @@
 
             dev = disk.find('target').get('dev')
             full_path = disk.find('source').get('file')
+            if full_path is None:
+                logging.info(f"Skipping disk without a file (NVMe?)")
+                continue
             _, _, pool, path = full_path.split('/')
 
             size, _, _ = domain.blockInfo(dev)
@@ -498,24 +508,24 @@
 
         return disk_data
 
-    def get_domjobinfo(self, vm):
+    def get_domjobinfo(self, vm_instancename):
         try:
             lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
             all_domains = lv.listAllDomains()
-            if any([x for x in all_domains if x.name() == vm]):
-                domain = lv.lookupByName(vm)
+            if any([x for x in all_domains if x.name() == vm_instancename]):
+                domain = lv.lookupByName(vm_instancename)
                 domjobinfo = domain.jobInfo()
                 return DomJobInfo.from_list(domjobinfo)
         except libvirt.libvirtError as _:
             pass # Ignore exception
         return DomJobInfo()
 
-    def get_domjobstats(self, vm, correction=True):
+    def get_domjobstats(self, vm_instancename, correction=True):
         try:
             lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
             all_domains = lv.listAllDomains()
-            if any([x for x in all_domains if x.name() == vm]):
-                domain = lv.lookupByName(vm)
+            if any([x for x in all_domains if x.name() == vm_instancename]):
+                domain = lv.lookupByName(vm_instancename)
                 domjobstats = domain.jobStats()
                 memory_total = domjobstats.get('memory_total', 0)
                 if correction:
@@ -541,14 +551,14 @@ def get_domjobstats(self, vm, correction=True):
             pass # Ignore exception
         return DomJobInfo()
 
-    def get_blkjobinfo(self, vm, volume):
+    def get_blkjobinfo(self, vm_instancename, volume):
         try:
-            disks = self.get_disks(vm)
+            disks = self.get_disks(vm_instancename)
             disk = dict(filter(lambda x: x[0] == volume, disks.items()))
             lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
             all_domains = lv.listAllDomains()
-            if any([x for x in all_domains if x.name() == vm['instancename']]):
-                domain = lv.lookupByName(vm['instancename'])
+            if any([x for x in all_domains if x.name() == vm_instancename]):
+                domain = lv.lookupByName(vm_instancename)
                 blkjobinfo = domain.blockJobInfo(disk[volume]['dev'], 0)
                 return BlkJobInfo(
                     jobType=blkjobinfo.get('type', 0),
@@ -560,27 +570,27 @@ def get_blkjobinfo(self, vm, volume):
             pass # Ignore exception
         return BlkJobInfo()
 
-    def set_iops_limit(self, vm, max_iops):
+    def set_iops_limit(self, vm_instancename, max_iops):
         command = f"""
-        for i in $(/usr/bin/virsh domblklist --details '{vm['name']}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
-          /usr/bin/virsh blkdeviotune '{vm['name']}' $i --total-iops-sec {max_iops} --live
+        for i in $(/usr/bin/virsh domblklist --details '{vm_instancename}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
+          /usr/bin/virsh blkdeviotune '{vm_instancename}' $i --total-iops-sec {max_iops} --live
         done
         """
 
         if not self.execute(command, sudo=True).return_code == 0:
-            logging.error(f"Failed to set IOPS limit for '{vm['name']}'")
+            logging.error(f"Failed to set IOPS limit for '{vm_instancename}'")
             return False
         else:
             return True
 
-    def merge_backing_files(self, vm):
+    def merge_backing_files(self, vm_instancename):
         command = f"""
-        for i in $(/usr/bin/virsh domblklist --details '{vm['instancename']}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
-          /usr/bin/virsh blockpull '{vm['instancename']}' $i --wait --verbose
+        for i in $(/usr/bin/virsh domblklist --details '{vm_instancename}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
+          /usr/bin/virsh blockpull '{vm_instancename}' $i --wait --verbose
         done
         """
         if not self.execute(command, sudo=True).return_code == 0:
-            logging.error(f"Failed to merge backing volumes for '{vm['name']}'")
+            logging.error(f"Failed to merge backing volumes for '{vm_instancename}'")
             return False
         else:
             return True
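The common thread in this file: every libvirt helper now takes the VM's libvirt domain name (Cosmic's 'instancename', e.g. 'i-2-345-VM') as a string instead of the whole VM object. For set_iops_limit and merge_backing_files, which previously shelled out to virsh with vm['name'] while virsh identifies domains by the libvirt domain name, passing the instance name directly looks like part of the fix. A caller-side sketch with hypothetical values:

# Hypothetical CosmicHost 'host' and VM dict; only the 'instancename' key
# matters to the renamed helpers now.
vm = {'name': 'app01', 'instancename': 'i-2-345-VM'}

disks = host.get_disks(vm['instancename'])         # was: host.get_disks(vm)
jobinfo = host.get_domjobinfo(vm['instancename'])  # was: host.get_domjobinfo(vm)
host.set_iops_limit(vm['instancename'], max_iops=1000)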
8 changes: 4 additions & 4 deletions cosmicops/ops.py
@@ -283,7 +283,7 @@ def wait_for_vm_migration_job(self, job_id, retries=10, domjobinfo=True, source_
         print()
         return status
 
-    def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, source_host=None, vm=None):
+    def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, source_host=None, vm_instancename=None):
         prev_percentage = 0.
 
         # Hack to wait for job to start
@@ -294,8 +294,8 @@ def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, sour
             logging.error(f"Error: Could not find volume '{volume_id}'")
             return False
 
-        if blkjobinfo and source_host and vm:
-            blkjobinfo = source_host.get_blkjobinfo(vm, volume['path'])
+        if blkjobinfo and source_host and vm_instancename:
+            blkjobinfo = source_host.get_blkjobinfo(vm_instancename, volume['path'])
             cur_percentage = float(blkjobinfo.current / (blkjobinfo.end or 1) * 100)
             if cur_percentage > prev_percentage:
                 prev_percentage = cur_percentage
@@ -308,7 +308,7 @@ def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, sour
                 logging.debug(f"Volume '{volume_id}' is in {volume['state']} state and not Ready. Sleeping.")
         # Return result of job
         status = self.wait_for_job(job_id=job_id, retries=1)
-        if blkjobinfo and source_host and vm and status:
+        if blkjobinfo and source_host and vm_instancename and status:
             print("100% ")
         else:
             print()
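wait_for_volume_migration_job follows suit and takes the instance name directly. A hypothetical call, assuming volume, job_id, and source_host were obtained from the surrounding migration code:

# Hypothetical call with the renamed keyword; co is a CosmicOps instance and
# source_host a CosmicHost, as elsewhere in this module.
ok = co.wait_for_volume_migration_job(volume_id=volume['id'],
                                      job_id=job_id,
                                      blkjobinfo=True,
                                      source_host=source_host,
                                      vm_instancename=vm['instancename'])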
5 changes: 3 additions & 2 deletions empty_host.py
@@ -28,9 +28,10 @@
 @click.option('--shutdown', is_flag=True, help='Shutdown host when all VMs have been migrated')
 @click.option('--skip-disable', is_flag=True, help='Do not disable host before emptying it')
 @click.option('--dry-run/--exec', is_flag=True, default=True, show_default=True, help='Enable/disable dry-run')
+@click.option('--target-host', help='Target hypervisor to migrate VMs to', required=False)
 @click_log.simple_verbosity_option(logging.getLogger(), default="INFO", show_default=True)
 @click.argument('host')
-def main(profile, shutdown, skip_disable, dry_run, host):
+def main(profile, shutdown, skip_disable, dry_run, target_host, host):
     """Empty HOST by migrating VMs to another host in the same cluster."""
 
     click_log.basic_config()
@@ -39,7 +40,7 @@ def main(profile, shutdown, skip_disable, dry_run, host):
         logging.info('Running in dry-run mode, will only show changes')
 
     try:
-        logging.info(empty_host(profile, shutdown, skip_disable, dry_run, host))
+        logging.info(empty_host(profile, shutdown, skip_disable, dry_run, host, target_host))
     except RuntimeError as err:
         logging.error(err)
         sys.exit(1)
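With the new flag, the script can be pointed at a specific landing host. A usage sketch, with hypothetical host names and assuming the usual --profile option defined just above this hunk:

# Dry-run (the default): only shows which VMs would move from hv01 to hv02.
python empty_host.py --profile config --target-host hv02 hv01

# Migrate for real, then shut hv01 down once it is empty.
python empty_host.py --profile config --exec --shutdown --target-host hv02 hv01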
(The remaining 8 of the 13 changed files were not loaded in this view.)
