Redo volumes, disks, images cleanup #171

Merged 1 commit on Dec 6, 2022
1 change: 1 addition & 0 deletions .coveragerc
@@ -2,3 +2,4 @@
omit =
*/lib/python*
*/migrations/*
*/tests/*
4 changes: 2 additions & 2 deletions Makefile
@@ -13,11 +13,11 @@ test:
flake8 webui
flake8 ocw
flake8 manage.py
pytest --cov=./
pytest --cov

.PHONY: codecov
codecov:
pytest -v --cov=./ --cov-report=html && xdg-open htmlcov/index.html
pytest -v --cov --cov-report=html && xdg-open htmlcov/index.html

# Build containers
docker-container:
126 changes: 24 additions & 102 deletions ocw/lib/EC2.py
@@ -1,9 +1,8 @@
from .provider import Provider, Image
from .provider import Provider
from webui.settings import PCWConfig, ConfigFile
from dateutil.parser import parse
import boto3
from botocore.exceptions import ClientError
import re
from datetime import date, datetime, timedelta, timezone
from ocw.lib.emailnotify import send_mail
import traceback
@@ -89,47 +88,35 @@ def all_clusters(self):
return clusters

@staticmethod
def needs_to_delete_snapshot(snapshot, cleanup_ec2_max_snapshot_age_days) -> bool:
delete_older_than = date.today() - timedelta(days=cleanup_ec2_max_snapshot_age_days)
if datetime.date(snapshot['StartTime']) < delete_older_than:
regexes = [
re.compile(r'''^OpenQA upload image$'''),
re.compile(r'''^Created by CreateImage\([\w-]+\) for ami-\w+ from vol-\w+$''')
]
for regex in regexes:
m = re.match(regex, snapshot['Description'].strip())
if m:
return True
return False
def is_outdated(creation_time, valid_period_days) -> bool:
return datetime.date(creation_time) < (date.today() - timedelta(days=valid_period_days))

def cleanup_snapshots(self, cleanup_ec2_max_snapshot_age_days):
def cleanup_snapshots(self, valid_period_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_snapshots(OwnerIds=['self'])
response['Snapshots'].sort(key=lambda snapshot: snapshot['StartTime'].timestamp())
for snapshot in response['Snapshots']:
if EC2.needs_to_delete_snapshot(snapshot, cleanup_ec2_max_snapshot_age_days):
self.log_info("Deleting snapshot {} in region {} with StartTime={}", snapshot['SnapshotId'],
region, snapshot['StartTime'])
if EC2.is_outdated(snapshot['StartTime'], valid_period_days):
try:
if self.dry_run:
self.log_info("Snapshot deletion of {} skipped due to dry run mode",
snapshot['SnapshotId'])
else:
self.log_info("Deleting snapshot {} in region {} with StartTime={}",
snapshot['SnapshotId'], region, snapshot['StartTime'])
self.ec2_client(region).delete_snapshot(SnapshotId=snapshot['SnapshotId'])
except ClientError as ex:
if ex.response['Error']['Code'] == 'InvalidSnapshot.InUse':
self.log_info(ex.response['Error']['Message'])
else:
raise ex

def cleanup_volumes(self, cleanup_ec2_max_volumes_age_days):
delete_older_than = date.today() - timedelta(days=cleanup_ec2_max_volumes_age_days)
def cleanup_volumes(self, valid_period_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_volumes()
for volume in response['Volumes']:
if datetime.date(volume['CreateTime']) < delete_older_than:
if EC2.is_outdated(volume['CreateTime'], valid_period_days):
if self.volume_protected(volume):
self.log_info('Volume {} has tag DO_NOT_DELETE so protected from deletion',
self.log_info('Volume {} has tag pcw_ignore so protected from deletion',
volume['VolumeId'])
elif self.dry_run:
self.log_info("Volume deletion of {} skipped due to dry run mode", volume['VolumeId'])
@@ -147,7 +134,7 @@ def cleanup_volumes(self, cleanup_ec2_max_volumes_age_days):
def volume_protected(self, volume):
if 'Tags' in volume:
for tag in volume['Tags']:
if tag['Key'] == 'DO_NOT_DELETE':
if tag['Key'] == 'pcw_ignore':
return True
return False

@@ -209,66 +196,13 @@ def delete_all_clusters(self):
self.log_info("Finally deleting {} cluster", cluster)
self.eks_client(region).delete_cluster(name=cluster)

def parse_image_name(self, img_name):
regexes = [
# openqa-SLES12-SP5-EC2.x86_64-0.9.1-BYOS-Build1.55.raw.xz
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
-(?P<flavor>EC2)
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<type>(BYOS|On-Demand))
-Build
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X),
# openqa-SLES15-SP2.x86_64-0.9.3-EC2-HVM-Build1.10.raw.xz'
# openqa-SLES15-SP2-BYOS.x86_64-0.9.3-EC2-HVM-Build1.10.raw.xz'
# openqa-SLES15-SP2.aarch64-0.9.3-EC2-HVM-Build1.49.raw.xz'
# openqa-SLES15-SP4-SAP-BYOS.x86_64-0.9.3-EC2-Build150400.1.31.raw.xz
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
(-(?P<type>[^\.]+))?
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<flavor>EC2[-\w]*)
-Build(\d+\.)?
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X),
# openqa-SLES12-SP4-EC2-HVM-BYOS.x86_64-0.9.2-Build2.56.raw.xz'
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
-
(?P<flavor>EC2[^\.]+)
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
Build
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X)
]
return self.parse_image_name_helper(img_name, regexes)

def cleanup_all(self):
cleanup_ec2_max_snapshot_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-snapshot-age-days',
self._namespace)
cleanup_ec2_max_volumes_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-volumes-age-days',
self._namespace)
self.cleanup_images()
if cleanup_ec2_max_snapshot_age_days >= 0:
self.cleanup_snapshots(cleanup_ec2_max_snapshot_age_days)
if cleanup_ec2_max_volumes_age_days >= 0:
self.cleanup_volumes(cleanup_ec2_max_volumes_age_days)
valid_period_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-age-days', self._namespace)

if valid_period_days > 0:
self.cleanup_images(valid_period_days)
self.cleanup_snapshots(valid_period_days)
self.cleanup_volumes(valid_period_days)
if PCWConfig.getBoolean('cleanup/vpc_cleanup', self._namespace):
self.cleanup_uploader_vpcs()

@@ -389,25 +323,13 @@ def cleanup_uploader_vpcs(self):
region)
send_mail('VPC deletion locked by running VMs', body)

def cleanup_images(self):
def cleanup_images(self, valid_period_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_images(Owners=['self'])
images = list()
for img in response['Images']:
# img is in the format described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
m = self.parse_image_name(img['Name'])
if m:
self.log_dbg("Image {} is candidate for deletion with build {}", img['Name'], m['build'])
images.append(
Image(img['Name'], flavor=m['key'], build=m['build'], date=parse(img['CreationDate']),
img_id=img['ImageId']))
else:
self.log_err(" Unable to parse image name '{}'", img['Name'])
keep_images = self.get_keeping_image_names(images)
for img in [i for i in images if i.name not in keep_images]:
self.log_dbg("Delete image '{}' (ami:{})".format(img.name, img.id))
if self.dry_run:
self.log_info("Image deletion {} skipped due to dry run mode", img.id)
else:
self.ec2_client(region).deregister_image(ImageId=img.id, DryRun=False)
if EC2.is_outdated(parse(img['CreationDate']), valid_period_days):
if self.dry_run:
self.log_info("Image deletion {} skipped due to dry run mode", img['ImageId'])
else:
self.log_dbg("Delete image '{}' (ami:{})".format(img['Name'], img['ImageId']))
self.ec2_client(region).deregister_image(ImageId=img['ImageId'], DryRun=False)
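The EC2 refactor above replaces the per-resource age checks (needs_to_delete_snapshot, the inline volume date comparison, and the image-name parsing) with one shared is_outdated helper driven by a single ec2-max-age-days setting. A minimal standalone sketch of that check, assuming the same valid_period_days semantics as the diff; the date arithmetic mirrors the helper, and the example values are hypothetical:

from datetime import date, datetime, timedelta, timezone

def is_outdated(creation_time: datetime, valid_period_days: int) -> bool:
    # Outdated means the creation date falls before today minus the retention window.
    return creation_time.date() < (date.today() - timedelta(days=valid_period_days))

# Example: a snapshot created 10 days ago, checked against two retention windows
created = datetime.now(timezone.utc) - timedelta(days=10)
print(is_outdated(created, 7))   # True  -> candidate for deletion
print(is_outdated(created, 30))  # False -> kept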
164 changes: 50 additions & 114 deletions ocw/lib/azure.py
@@ -1,4 +1,4 @@
from .provider import Provider, Image
from .provider import Provider
from webui.settings import PCWConfig
from azure.identity import ClientSecretCredential
from azure.mgmt.resource import ResourceManagementClient
@@ -100,126 +100,62 @@ def list_disks_by_resource_group(self, resource_group):

def list_by_resource_group(self, resource_group, filters=None):
return [item for item in self.resource_mgmt_client().resources.list_by_resource_group(
resource_group, filter=filters)]

def get_keeping_image_names(self):
images = list()
for item in self.container_client('sle-images').list_blobs():
m = self.parse_image_name(item.name)
if m:
images.append(Image(item.name, flavor=m['key'], build=m['build'], date=item.last_modified))
else:
self.log_err("Unable to parse image name '{}'", item.name)

return super().get_keeping_image_names(images)
resource_group, filter=filters, expand="changedTime")]

def cleanup_all(self):
''' Cleanup all auto-generated data which might have been created during automated tests.'''
self.cleanup_bootdiagnostics()

keep_images = self.get_keeping_image_names()
self.cleanup_sle_images_container(keep_images)
self.cleanup_disks_from_rg(keep_images)
self.cleanup_images_from_rg(keep_images)
for i in keep_images:
self.log_info("Keep image {} ", i)

def cleanup_bootdiagnostics(self):
containers = self.bs_client().list_containers()
self.cleanup_images_from_rg()
self.cleanup_disks_from_rg()
self.cleanup_blob_containers()

@staticmethod
def container_valid_for_cleanup(container):
'''
By "container" we mean an Azure Blob Storage container. See
https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction
for more details.
A container is valid for cleanup if it meets 2 conditions:
1. the container "metadata" does not contain the special tag (pcw_ignore)
2. the container name starts with "bootdiagnostics-" or is exactly "sle-images"
'''
if 'pcw_ignore' in container['metadata']:
return False
if re.match('^bootdiagnostics-', container.name):
return True
if container.name == 'sle-images':
return True
return False

def cleanup_blob_containers(self):
containers = self.bs_client().list_containers(include_metadata=True)
for c in containers:
self.log_dbg('Found container {}', c.name)
if (re.match('^bootdiagnostics-', c.name)):
self.cleanup_bootdiagnostics_container(c)

def cleanup_bootdiagnostics_container(self, container):
latest_modification = container.last_modified
container_blobs = self.container_client(container.name).list_blobs()
for blob in container_blobs:
if (latest_modification > blob.last_modified):
latest_modification = blob.last_modified
if (self.older_than_min_age(latest_modification)):
self.log_info("Mark container for deletion {}", container.name)
if self.dry_run:
self.log_info("Deletion of boot diagnostic container {} skipped due to dry run mode", container.name)
else:
self.bs_client().delete_container(container.name)

def parse_image_name(self, img_name):
regexes = [
# SLES12-SP5-Azure.x86_64-0.9.1-SAP-BYOS-Build3.3.vhd
re.compile(r"""
SLES
(?P<version>\d+(-SP\d+)?)
-Azure\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<flavor>[-\w]+)
-
Build(?P<build>\d+\.\d+)
\.vhd
""",
re.X),

# SLES15-SP2-BYOS.x86_64-0.9.3-Azure-Build1.10.vhd
# SLES15-SP2.x86_64-0.9.3-Azure-Basic-Build1.11.vhd
# SLES15-SP2-SAP-BYOS.x86_64-0.9.2-Azure-Build1.9.vhd
# SLES15-SP4-BYOS.x86_64-0.9.1-Azure-Build150400.2.103.vhd
re.compile(r"""
SLES
(?P<version>\d+(-SP\d+)?)
(-(?P<type>[^\.]+))?\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
(-(?P<flavor>Azure[-\w]*))?
-
Build(\d+\.)?(?P<build>\d+\.\d+)
\.vhd
""",
re.X)
]
return self.parse_image_name_helper(img_name, regexes)

def cleanup_sle_images_container(self, keep_images):
container_client = self.container_client('sle-images')
for img in container_client.list_blobs():
m = self.parse_image_name(img.name)
if m:
self.log_dbg('Blob {} is candidate for deletion with build {} ', img.name, m['build'])

if img.name not in keep_images:
self.log_info("Delete blob '{}'", img.name)
if self.dry_run:
self.log_info("Deletion of blob image {} skipped due to dry run mode", img.name)
else:
container_client.delete_blob(img.name, delete_snapshots="include")
if Azure.container_valid_for_cleanup(c):
self.log_dbg('Found container {}', c.name)
container_blobs = self.container_client(c.name).list_blobs()
for blob in container_blobs:
if (self.is_outdated(blob.last_modified)):
if self.dry_run:
self.log_info("Deletion of blob {} skipped due to dry run mode", blob.name)
else:
self.log_info("Deleting blob {}", blob.name)
self.container_client(c.name).delete_blob(blob.name, delete_snapshots="include")

def cleanup_images_from_rg(self, keep_images):
def cleanup_images_from_rg(self):
for item in self.list_images_by_resource_group(self.__resource_group):
m = self.parse_image_name(item.name)
if m:
self.log_dbg('Image {} is candidate for deletion with build {} ', item.name, m['build'])
if item.name not in keep_images:
if self.is_outdated(item.changed_time):
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.log_info("Delete image '{}'", item.name)
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.compute_mgmt_client().images.begin_delete(self.__resource_group, item.name)
self.compute_mgmt_client().images.begin_delete(self.__resource_group, item.name)

def cleanup_disks_from_rg(self, keep_images):
def cleanup_disks_from_rg(self):
for item in self.list_disks_by_resource_group(self.__resource_group):
m = self.parse_image_name(item.name)
if m:
self.log_dbg('Disk {} is candidate for deletion with build {} ', item.name, m['build'])

if item.name not in keep_images:
if self.compute_mgmt_client().disks.get(self.__resource_group, item.name).managed_by:
self.log_warn("Disk is in use - unable delete {}", item.name)
if self.is_outdated(item.changed_time):
if self.compute_mgmt_client().disks.get(self.__resource_group, item.name).managed_by:
self.log_warn("Disk is in use - unable delete {}", item.name)
else:
if self.dry_run:
self.log_info("Deletion of disk {} skipped due to dry run mode", item.name)
else:
self.log_info("Delete disk '{}'", item.name)
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.compute_mgmt_client().disks.begin_delete(self.__resource_group, item.name)
self.compute_mgmt_client().disks.begin_delete(self.__resource_group, item.name)
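On the Azure side the same age rule is applied per blob and per resource, gated by the container_valid_for_cleanup predicate shown above. A minimal sketch of that predicate, assuming stand-in objects with name and metadata attributes in place of the Azure SDK's ContainerProperties (plain attribute access is an illustrative simplification of the mixed item/attribute access in the diff):

import re
from types import SimpleNamespace

def container_valid_for_cleanup(container) -> bool:
    """Return True when a blob container qualifies for the cleanup pass."""
    # Containers tagged with pcw_ignore in their metadata are never touched.
    if 'pcw_ignore' in (container.metadata or {}):
        return False
    # Only boot-diagnostics containers and the sle-images container qualify.
    return bool(re.match('^bootdiagnostics-', container.name)) or container.name == 'sle-images'

# Hypothetical stand-ins for the container objects returned by list_containers()
containers = [
    SimpleNamespace(name='bootdiagnostics-openqa-123', metadata={}),
    SimpleNamespace(name='sle-images', metadata={'pcw_ignore': '1'}),
    SimpleNamespace(name='terraform-state', metadata={}),
]
for c in containers:
    print(c.name, container_valid_for_cleanup(c))
# bootdiagnostics-openqa-123 True, sle-images False, terraform-state False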