
Merge pull request #92 from MissionCriticalCloud/fix-storage-maintenance-mode-rebase

Fix storage maintenance mode rebase
neubauerf authored Oct 12, 2023
2 parents 2111a52 + 7ef23e3 commit dac0d3a
Showing 13 changed files with 212 additions and 142 deletions.
7 changes: 5 additions & 2 deletions cosmicops/empty_host.py
@@ -17,7 +17,7 @@
 from cosmicops import CosmicOps, RebootAction, logging
 
 
-def empty_host(profile, shutdown, skip_disable, dry_run, host):
+def empty_host(profile, shutdown, skip_disable, dry_run, host, target_host):
     click_log.basic_config()
 
     log_to_slack = True
@@ -35,7 +35,10 @@ def empty_host(profile, shutdown, skip_disable, dry_run, host):
     if not host.disable():
         raise RuntimeError(f"Failed to disable host '{host['name']}'")
 
-    (total, success, failed) = host.empty()
+    if target_host:
+        target_host = co.get_host(name=target_host)
+
+    (total, success, failed) = host.empty(target=target_host)
     result_message = f"Result: {success} successful, {failed} failed out of {total} total VMs"
 
     if not failed and shutdown:
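The net effect: empty_host() now accepts an optional target host name, resolves it to a host object via co.get_host(), and passes it through to host.empty(). A minimal usage sketch; the profile name and host names below are hypothetical:

# Minimal sketch of calling the updated library function; the 'config'
# profile and the host names hv01/hv02 are hypothetical.
from cosmicops.empty_host import empty_host

# Empty hv01, forcing hv02 as the migration target instead of letting
# find_migration_host() pick a host per VM. dry_run=True changes nothing.
message = empty_host(profile='config', shutdown=False, skip_disable=False,
                     dry_run=True, host='hv01', target_host='hv02')
print(message)  # e.g. "Result: 12 successful, 0 failed out of 12 total VMs"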
3 changes: 3 additions & 0 deletions cosmicops/objects/cluster.py
@@ -63,6 +63,9 @@ def find_migration_host(self, vm):
             if vm_on_dedicated_hv and not host['dedicated']:
                 continue
 
+            if not vm_on_dedicated_hv and host['dedicated']:
+                continue
+
             if vm_on_dedicated_hv and host['affinitygroupid'] != dedicated_affinity_id:
                 continue
 
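The added lines are the mirror image of the check right above them: a VM that does not live on dedicated hardware must now skip dedicated hosts, just as a dedicated VM already skipped shared ones. A standalone sketch of the resulting filter, with host as a plain dict (the real method applies further checks as well):

# Simplified sketch of the host filter in find_migration_host() after this
# commit; 'host' is a plain dict standing in for the real Cosmic host object.
def host_is_eligible(host, vm_on_dedicated_hv, dedicated_affinity_id):
    if vm_on_dedicated_hv and not host['dedicated']:
        return False  # dedicated VM may not land on a shared host
    if not vm_on_dedicated_hv and host['dedicated']:
        return False  # new check: shared VM may not land on a dedicated host
    if vm_on_dedicated_hv and host['affinitygroupid'] != dedicated_affinity_id:
        return False  # dedicated VM must stay within its affinity group
    return True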
64 changes: 37 additions & 27 deletions cosmicops/objects/host.py
@@ -421,13 +421,20 @@ def wait_until_online(self):
         else:
             logging.info(f"Waiting for '{self['name']}' to come back online", self.log_to_slack)
             with click_spinner.spinner():
-                while True:
-                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                        s.settimeout(5)
-                        result = s.connect_ex((self['name'], 22))
-
-                    if result == 0:
-                        break
+                # adding retry tests, so we need to be able to connect to SSH three times in one minute
+                # before we consider the host up
+                tests = 1
+                logging.info(f"Waiting for SSH connection, attempt {tests} of 3", False)
+                while tests <= 3:
+                    while True:
+                        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                            s.settimeout(5)
+                            result = s.connect_ex((self['name'], 22))
+
+                        if result == 0:
+                            break
+                    time.sleep(20)
+                    tests += 1
 
         if self.dry_run:
             logging.info(f"Would wait for libvirt on '{self['name']}'")
@@ -467,10 +474,10 @@ def wait_for_agent(self):
 
             time.sleep(5)
 
-    def get_disks(self, vm):
+    def get_disks(self, vm_instancename):
         lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
 
-        domain = lv.lookupByName(vm['instancename'])
+        domain = lv.lookupByName(vm_instancename)
 
         tree = ElementTree.fromstring(domain.XMLDesc())
         block_devs = tree.findall('devices/disk')
@@ -483,6 +490,9 @@
 
             dev = disk.find('target').get('dev')
             full_path = disk.find('source').get('file')
+            if full_path is None:
+                logging.info(f"Skipping disk without a file (NVMe?)")
+                continue
             _, _, pool, path = full_path.split('/')
 
             size, _, _ = domain.blockInfo(dev)
@@ -498,24 +508,24 @@
 
         return disk_data
 
-    def get_domjobinfo(self, vm):
+    def get_domjobinfo(self, vm_instancename):
         try:
             lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
             all_domains = lv.listAllDomains()
-            if any([x for x in all_domains if x.name() == vm]):
-                domain = lv.lookupByName(vm)
+            if any([x for x in all_domains if x.name() == vm_instancename]):
+                domain = lv.lookupByName(vm_instancename)
                 domjobinfo = domain.jobInfo()
                 return DomJobInfo.from_list(domjobinfo)
         except libvirt.libvirtError as _:
             pass # Ignore exception
         return DomJobInfo()
 
-    def get_domjobstats(self, vm, correction=True):
+    def get_domjobstats(self, vm_instancename, correction=True):
         try:
             lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
             all_domains = lv.listAllDomains()
-            if any([x for x in all_domains if x.name() == vm]):
-                domain = lv.lookupByName(vm)
+            if any([x for x in all_domains if x.name() == vm_instancename]):
+                domain = lv.lookupByName(vm_instancename)
                 domjobstats = domain.jobStats()
                 memory_total = domjobstats.get('memory_total', 0)
                 if correction:
@@ -541,14 +551,14 @@ def get_domjobstats(self, vm, correction=True):
             pass # Ignore exception
         return DomJobInfo()
 
-    def get_blkjobinfo(self, vm, volume):
+    def get_blkjobinfo(self, vm_instancename, volume):
         try:
-            disks = self.get_disks(vm)
+            disks = self.get_disks(vm_instancename)
             disk = dict(filter(lambda x: x[0] == volume, disks.items()))
             lv = libvirt.openReadOnly(f"qemu+tcp://{self['name']}/system")
             all_domains = lv.listAllDomains()
-            if any([x for x in all_domains if x.name() == vm['instancename']]):
-                domain = lv.lookupByName(vm['instancename'])
+            if any([x for x in all_domains if x.name() == vm_instancename]):
+                domain = lv.lookupByName(vm_instancename)
                 blkjobinfo = domain.blockJobInfo(disk[volume]['dev'], 0)
                 return BlkJobInfo(
                     jobType=blkjobinfo.get('type', 0),
@@ -560,27 +570,27 @@ def get_blkjobinfo(self, vm, volume):
             pass # Ignore exception
         return BlkJobInfo()
 
-    def set_iops_limit(self, vm, max_iops):
+    def set_iops_limit(self, vm_instancename, max_iops):
         command = f"""
-        for i in $(/usr/bin/virsh domblklist --details '{vm['name']}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
-          /usr/bin/virsh blkdeviotune '{vm['name']}' $i --total-iops-sec {max_iops} --live
+        for i in $(/usr/bin/virsh domblklist --details '{vm_instancename}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
+          /usr/bin/virsh blkdeviotune '{vm_instancename}' $i --total-iops-sec {max_iops} --live
         done
         """
 
         if not self.execute(command, sudo=True).return_code == 0:
-            logging.error(f"Failed to set IOPS limit for '{vm['name']}'")
+            logging.error(f"Failed to set IOPS limit for '{vm_instancename}'")
             return False
         else:
             return True
 
-    def merge_backing_files(self, vm):
+    def merge_backing_files(self, vm_instancename):
         command = f"""
-        for i in $(/usr/bin/virsh domblklist --details '{vm['instancename']}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
-          /usr/bin/virsh blockpull '{vm['instancename']}' $i --wait --verbose
+        for i in $(/usr/bin/virsh domblklist --details '{vm_instancename}' | grep disk | grep file | /usr/bin/awk '{{print $3}}'); do
+          /usr/bin/virsh blockpull '{vm_instancename}' $i --wait --verbose
         done
         """
         if not self.execute(command, sudo=True).return_code == 0:
-            logging.error(f"Failed to merge backing volumes for '{vm['name']}'")
+            logging.error(f"Failed to merge backing volumes for '{vm_instancename}'")
             return False
         else:
             return True
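The common thread in this file: every libvirt helper now takes the VM's libvirt domain name (Cosmic's 'instancename', e.g. 'i-2-345-VM') as a string instead of the whole VM object. For set_iops_limit and merge_backing_files, which previously shelled out to virsh with vm['name'] while virsh identifies domains by the libvirt domain name, passing the instance name directly looks like part of the fix. A caller-side sketch with hypothetical values:

# Hypothetical CosmicHost 'host' and VM dict; only the 'instancename' key
# matters to the renamed helpers now.
vm = {'name': 'app01', 'instancename': 'i-2-345-VM'}

disks = host.get_disks(vm['instancename'])         # was: host.get_disks(vm)
jobinfo = host.get_domjobinfo(vm['instancename'])  # was: host.get_domjobinfo(vm)
host.set_iops_limit(vm['instancename'], max_iops=1000)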
8 changes: 4 additions & 4 deletions cosmicops/ops.py
@@ -283,7 +283,7 @@ def wait_for_vm_migration_job(self, job_id, retries=10, domjobinfo=True, source_
         print()
         return status
 
-    def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, source_host=None, vm=None):
+    def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, source_host=None, vm_instancename=None):
         prev_percentage = 0.
 
         # Hack to wait for job to start
@@ -294,8 +294,8 @@ def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, sour
             logging.error(f"Error: Could not find volume '{volume_id}'")
             return False
 
-        if blkjobinfo and source_host and vm:
-            blkjobinfo = source_host.get_blkjobinfo(vm, volume['path'])
+        if blkjobinfo and source_host and vm_instancename:
+            blkjobinfo = source_host.get_blkjobinfo(vm_instancename, volume['path'])
             cur_percentage = float(blkjobinfo.current / (blkjobinfo.end or 1) * 100)
             if cur_percentage > prev_percentage:
                 prev_percentage = cur_percentage
@@ -308,7 +308,7 @@ def wait_for_volume_migration_job(self, volume_id, job_id, blkjobinfo=True, sour
                 logging.debug(f"Volume '{volume_id}' is in {volume['state']} state and not Ready. Sleeping.")
         # Return result of job
         status = self.wait_for_job(job_id=job_id, retries=1)
-        if blkjobinfo and source_host and vm and status:
+        if blkjobinfo and source_host and vm_instancename and status:
             print("100% ")
         else:
             print()
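wait_for_volume_migration_job follows suit and takes the instance name directly. A hypothetical call, assuming volume, job_id, and source_host were obtained from the surrounding migration code:

# Hypothetical call with the renamed keyword; co is a CosmicOps instance and
# source_host a CosmicHost, as elsewhere in this module.
ok = co.wait_for_volume_migration_job(volume_id=volume['id'],
                                      job_id=job_id,
                                      blkjobinfo=True,
                                      source_host=source_host,
                                      vm_instancename=vm['instancename'])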
5 changes: 3 additions & 2 deletions empty_host.py
@@ -28,9 +28,10 @@
 @click.option('--shutdown', is_flag=True, help='Shutdown host when all VMs have been migrated')
 @click.option('--skip-disable', is_flag=True, help='Do not disable host before emptying it')
 @click.option('--dry-run/--exec', is_flag=True, default=True, show_default=True, help='Enable/disable dry-run')
+@click.option('--target-host', help='Target hypervisor to migrate VMs to', required=False)
 @click_log.simple_verbosity_option(logging.getLogger(), default="INFO", show_default=True)
 @click.argument('host')
-def main(profile, shutdown, skip_disable, dry_run, host):
+def main(profile, shutdown, skip_disable, dry_run, target_host, host):
     """Empty HOST by migrating VMs to another host in the same cluster."""
 
     click_log.basic_config()
@@ -39,7 +40,7 @@ def main(profile, shutdown, skip_disable, dry_run, host):
         logging.info('Running in dry-run mode, will only show changes')
 
     try:
-        logging.info(empty_host(profile, shutdown, skip_disable, dry_run, host))
+        logging.info(empty_host(profile, shutdown, skip_disable, dry_run, host, target_host))
     except RuntimeError as err:
         logging.error(err)
         sys.exit(1)
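With the new flag, the script can be pointed at a specific landing host. A usage sketch, with hypothetical host names and assuming the usual --profile option defined just above this hunk:

# Dry-run (the default): only shows which VMs would move from hv01 to hv02.
python empty_host.py --profile config --target-host hv02 hv01

# Migrate for real, then shut hv01 down once it is empty.
python empty_host.py --profile config --exec --shutdown --target-host hv02 hv01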
(The remaining 8 of the 13 changed files were not loaded in this view.)
