Skip to content

Commit

Permalink
Redo volumes,disks,images cleanup
Browse files Browse the repository at this point in the history
Initially, when the data storage cleanup logic was created,
we wanted to make it very flexible on the one hand
and very cautious on the other. After more than 2 years
of practical usage of the tool we can clearly state that
half of the implemented features don't have a real use case. This commit
drops the unneeded functionality, keeping only what is necessary.
  • Loading branch information
asmorodskyi committed Nov 29, 2022
1 parent a2a5db6 commit 0d29c7e
Show file tree
Hide file tree
Showing 15 changed files with 350 additions and 820 deletions.
1 change: 1 addition & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
omit =
*/lib/python*
*/migrations/*
*/tests/*
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ test:
flake8 webui
flake8 ocw
flake8 manage.py
pytest --cov=./
pytest --cov

.PHONY: codecov
codecov:
pytest -v --cov=./ --cov-report=html && xdg-open htmlcov/index.html
pytest -v --cov --cov-report=html && xdg-open htmlcov/index.html

# Build containers
docker-container:
Expand Down
120 changes: 21 additions & 99 deletions ocw/lib/EC2.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from .provider import Provider, Image
from .provider import Provider
from webui.settings import PCWConfig, ConfigFile
from dateutil.parser import parse
import boto3
from botocore.exceptions import ClientError
import re
from datetime import date, datetime, timedelta, timezone
from ocw.lib.emailnotify import send_mail
import traceback
Expand Down Expand Up @@ -89,45 +88,33 @@ def all_clusters(self):
return clusters

@staticmethod
def needs_to_delete_snapshot(snapshot, cleanup_ec2_max_snapshot_age_days) -> bool:
delete_older_than = date.today() - timedelta(days=cleanup_ec2_max_snapshot_age_days)
if datetime.date(snapshot['StartTime']) < delete_older_than:
regexes = [
re.compile(r'''^OpenQA upload image$'''),
re.compile(r'''^Created by CreateImage\([\w-]+\) for ami-\w+ from vol-\w+$''')
]
for regex in regexes:
m = re.match(regex, snapshot['Description'].strip())
if m:
return True
return False
def needs_to_delete_by_age(creation_time, cleanup_ec2_max_age_days) -> bool:
return datetime.date(creation_time) < (date.today() - timedelta(days=cleanup_ec2_max_age_days))

def cleanup_snapshots(self, cleanup_ec2_max_snapshot_age_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_snapshots(OwnerIds=['self'])
response['Snapshots'].sort(key=lambda snapshot: snapshot['StartTime'].timestamp())
for snapshot in response['Snapshots']:
if EC2.needs_to_delete_snapshot(snapshot, cleanup_ec2_max_snapshot_age_days):
self.log_info("Deleting snapshot {} in region {} with StartTime={}", snapshot['SnapshotId'],
region, snapshot['StartTime'])
if EC2.needs_to_delete_by_age(snapshot['StartTime'], cleanup_ec2_max_snapshot_age_days):
try:
if self.dry_run:
self.log_info("Snapshot deletion of {} skipped due to dry run mode",
snapshot['SnapshotId'])
else:
self.log_info("Deleting snapshot {} in region {} with StartTime={}",
snapshot['SnapshotId'], region, snapshot['StartTime'])
self.ec2_client(region).delete_snapshot(SnapshotId=snapshot['SnapshotId'])
except ClientError as ex:
if ex.response['Error']['Code'] == 'InvalidSnapshot.InUse':
self.log_info(ex.response['Error']['Message'])
else:
raise ex

def cleanup_volumes(self, cleanup_ec2_max_volumes_age_days):
delete_older_than = date.today() - timedelta(days=cleanup_ec2_max_volumes_age_days)
def cleanup_volumes(self, cleanup_ec2_max_age_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_volumes()
for volume in response['Volumes']:
if datetime.date(volume['CreateTime']) < delete_older_than:
if EC2.needs_to_delete_by_age(volume['CreateTime'], cleanup_ec2_max_age_days):
if self.volume_protected(volume):
self.log_info('Volume {} has tag DO_NOT_DELETE so protected from deletion',
volume['VolumeId'])
Expand Down Expand Up @@ -209,66 +196,13 @@ def delete_all_clusters(self):
self.log_info("Finally deleting {} cluster", cluster)
self.eks_client(region).delete_cluster(name=cluster)

def parse_image_name(self, img_name):
regexes = [
# openqa-SLES12-SP5-EC2.x86_64-0.9.1-BYOS-Build1.55.raw.xz
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
-(?P<flavor>EC2)
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<type>(BYOS|On-Demand))
-Build
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X),
# openqa-SLES15-SP2.x86_64-0.9.3-EC2-HVM-Build1.10.raw.xz'
# openqa-SLES15-SP2-BYOS.x86_64-0.9.3-EC2-HVM-Build1.10.raw.xz'
# openqa-SLES15-SP2.aarch64-0.9.3-EC2-HVM-Build1.49.raw.xz'
# openqa-SLES15-SP4-SAP-BYOS.x86_64-0.9.3-EC2-Build150400.1.31.raw.xz
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
(-(?P<type>[^\.]+))?
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<flavor>EC2[-\w]*)
-Build(\d+\.)?
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X),
# openqa-SLES12-SP4-EC2-HVM-BYOS.x86_64-0.9.2-Build2.56.raw.xz'
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
-
(?P<flavor>EC2[^\.]+)
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
Build
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X)
]
return self.parse_image_name_helper(img_name, regexes)

def cleanup_all(self):
cleanup_ec2_max_snapshot_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-snapshot-age-days',
self._namespace)
cleanup_ec2_max_volumes_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-volumes-age-days',
self._namespace)
self.cleanup_images()
if cleanup_ec2_max_snapshot_age_days >= 0:
self.cleanup_snapshots(cleanup_ec2_max_snapshot_age_days)
if cleanup_ec2_max_volumes_age_days >= 0:
self.cleanup_volumes(cleanup_ec2_max_volumes_age_days)
cleanup_ec2_max_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-age-days', self._namespace)

if cleanup_ec2_max_age_days >= 0:
self.cleanup_images(cleanup_ec2_max_age_days)
self.cleanup_volumes(cleanup_ec2_max_age_days)
self.cleanup_snapshots(cleanup_ec2_max_age_days)
if PCWConfig.getBoolean('cleanup/vpc_cleanup', self._namespace):
self.cleanup_uploader_vpcs()

Expand Down Expand Up @@ -389,25 +323,13 @@ def cleanup_uploader_vpcs(self):
region)
send_mail('VPC deletion locked by running VMs', body)

def cleanup_images(self):
def cleanup_images(self, cleanup_ec2_max_age_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_images(Owners=['self'])
images = list()
for img in response['Images']:
# img is in the format described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
m = self.parse_image_name(img['Name'])
if m:
self.log_dbg("Image {} is candidate for deletion with build {}", img['Name'], m['build'])
images.append(
Image(img['Name'], flavor=m['key'], build=m['build'], date=parse(img['CreationDate']),
img_id=img['ImageId']))
else:
self.log_err(" Unable to parse image name '{}'", img['Name'])
keep_images = self.get_keeping_image_names(images)
for img in [i for i in images if i.name not in keep_images]:
self.log_dbg("Delete image '{}' (ami:{})".format(img.name, img.id))
if self.dry_run:
self.log_info("Image deletion {} skipped due to dry run mode", img.id)
else:
self.ec2_client(region).deregister_image(ImageId=img.id, DryRun=False)
if EC2.needs_to_delete_by_age(parse(img['CreationDate']), cleanup_ec2_max_age_days):
if self.dry_run:
self.log_info("Image deletion {} skipped due to dry run mode", img['ImageId'])
else:
self.log_dbg("Delete image '{}' (ami:{})".format(img['Name'], img['ImageId']))
self.ec2_client(region).deregister_image(ImageId=img['ImageId'], DryRun=False)
147 changes: 32 additions & 115 deletions ocw/lib/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from .provider import Provider, Image
from .provider import Provider
from webui.settings import PCWConfig
from azure.identity import ClientSecretCredential
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.compute import ComputeManagementClient
from azure.mgmt.storage import StorageManagementClient
from azure.storage.blob import BlobServiceClient
from msrest.exceptions import AuthenticationError
import re
import time
from typing import Dict

Expand Down Expand Up @@ -100,126 +99,44 @@ def list_disks_by_resource_group(self, resource_group):

def list_by_resource_group(self, resource_group, filters=None):
return [item for item in self.resource_mgmt_client().resources.list_by_resource_group(
resource_group, filter=filters)]

def get_keeping_image_names(self):
images = list()
for item in self.container_client('sle-images').list_blobs():
m = self.parse_image_name(item.name)
if m:
images.append(Image(item.name, flavor=m['key'], build=m['build'], date=item.last_modified))
else:
self.log_err("Unable to parse image name '{}'", item.name)

return super().get_keeping_image_names(images)
resource_group, filter=filters, expand="changedTime")]

def cleanup_all(self):
''' Cleanup all autodateed data which might created during automated tests.'''
self.cleanup_bootdiagnostics()

keep_images = self.get_keeping_image_names()
self.cleanup_sle_images_container(keep_images)
self.cleanup_disks_from_rg(keep_images)
self.cleanup_images_from_rg(keep_images)
for i in keep_images:
self.log_info("Keep image {} ", i)

def cleanup_bootdiagnostics(self):
containers = self.bs_client().list_containers()
self.cleanup_blob_containers()
self.cleanup_images_from_rg()
self.cleanup_disks_from_rg()

def cleanup_blob_containers(self):
containers = self.bs_client().list_containers(include_metadata=True)
for c in containers:
self.log_dbg('Found container {}', c.name)
if (re.match('^bootdiagnostics-', c.name)):
self.cleanup_bootdiagnostics_container(c)

def cleanup_bootdiagnostics_container(self, container):
latest_modification = container.last_modified
container_blobs = self.container_client(container.name).list_blobs()
for blob in container_blobs:
if (latest_modification > blob.last_modified):
latest_modification = blob.last_modified
if (self.older_than_min_age(latest_modification)):
self.log_info("Mark container for deletion {}", container.name)
if self.dry_run:
self.log_info("Deletion of boot diagnostic container {} skipped due to dry run mode", container.name)
else:
self.bs_client().delete_container(container.name)

def parse_image_name(self, img_name):
regexes = [
# SLES12-SP5-Azure.x86_64-0.9.1-SAP-BYOS-Build3.3.vhd
re.compile(r"""
SLES
(?P<version>\d+(-SP\d+)?)
-Azure\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<flavor>[-\w]+)
-
Build(?P<build>\d+\.\d+)
\.vhd
""",
re.X),

# SLES15-SP2-BYOS.x86_64-0.9.3-Azure-Build1.10.vhd
# SLES15-SP2.x86_64-0.9.3-Azure-Basic-Build1.11.vhd
# SLES15-SP2-SAP-BYOS.x86_64-0.9.2-Azure-Build1.9.vhd
# SLES15-SP4-BYOS.x86_64-0.9.1-Azure-Build150400.2.103.vhd
re.compile(r"""
SLES
(?P<version>\d+(-SP\d+)?)
(-(?P<type>[^\.]+))?\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
(-(?P<flavor>Azure[-\w]*))?
-
Build(\d+\.)?(?P<build>\d+\.\d+)
\.vhd
""",
re.X)
]
return self.parse_image_name_helper(img_name, regexes)

def cleanup_sle_images_container(self, keep_images):
container_client = self.container_client('sle-images')
for img in container_client.list_blobs():
m = self.parse_image_name(img.name)
if m:
self.log_dbg('Blob {} is candidate for deletion with build {} ', img.name, m['build'])

if img.name not in keep_images:
self.log_info("Delete blob '{}'", img.name)
if self.dry_run:
self.log_info("Deletion of blob image {} skipped due to dry run mode", img.name)
else:
container_client.delete_blob(img.name, delete_snapshots="include")
if 'pcw_ignore' not in c['metadata']:
self.log_dbg('Found container {}', c.name)
container_blobs = self.container_client(c.name).list_blobs()
for blob in container_blobs:
if (self.older_than_max_age_hours(blob.last_modified)):
if self.dry_run:
self.log_info("Deletion of blob {} skipped due to dry run mode", blob.name)
else:
self.log_info("Deleting blob {}", blob.name)
self.container_client(c.name).delete_blob(blob.name, delete_snapshots="include")

def cleanup_images_from_rg(self, keep_images):
def cleanup_images_from_rg(self):
for item in self.list_images_by_resource_group(self.__resource_group):
m = self.parse_image_name(item.name)
if m:
self.log_dbg('Image {} is candidate for deletion with build {} ', item.name, m['build'])
if item.name not in keep_images:
if self.older_than_max_age_hours(item.changed_time):
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.log_info("Delete image '{}'", item.name)
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.compute_mgmt_client().images.begin_delete(self.__resource_group, item.name)
self.compute_mgmt_client().images.begin_delete(self.__resource_group, item.name)

def cleanup_disks_from_rg(self, keep_images):
def cleanup_disks_from_rg(self):
for item in self.list_disks_by_resource_group(self.__resource_group):
m = self.parse_image_name(item.name)
if m:
self.log_dbg('Disk {} is candidate for deletion with build {} ', item.name, m['build'])

if item.name not in keep_images:
if self.compute_mgmt_client().disks.get(self.__resource_group, item.name).managed_by:
self.log_warn("Disk is in use - unable delete {}", item.name)
if self.older_than_max_age_hours(item.changed_time):
if self.compute_mgmt_client().disks.get(self.__resource_group, item.name).managed_by:
self.log_warn("Disk is in use - unable delete {}", item.name)
else:
if self.dry_run:
self.log_info("Deletion of disk {} skipped due to dry run mode", item.name)
else:
self.log_info("Delete disk '{}'", item.name)
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.compute_mgmt_client().disks.begin_delete(self.__resource_group, item.name)
self.compute_mgmt_client().disks.begin_delete(self.__resource_group, item.name)
Loading

0 comments on commit 0d29c7e

Please sign in to comment.