Skip to content

Commit

Permalink
Merge pull request #130 from TACC/dev-v3
Browse files Browse the repository at this point in the history
#129 - Fix health issue
  • Loading branch information
NotChristianGarcia authored May 17, 2024
2 parents d7a84f0 + 2c4d59d commit b8ece86
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 11 deletions.
22 changes: 13 additions & 9 deletions actors/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,28 +271,32 @@ def check_workers(actor_id, ttl):
except:
logger.error("Time received for TTL measurements is not of type datetime.")
last_execution = datetime.datetime.min
if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
worker_end_time = last_execution + datetime.timedelta(seconds=ttl)
now_time = datetime.datetime.utcnow()
if worker_end_time < now_time:
# shutdown worker
logger.info("Shutting down worker beyond ttl.")
shutdown_worker(actor_id, worker_id)
else:
logger.info("Still time left for this worker.")
logger.info(f"Worker is within ttl. Leaving worker. worker_end_time: {worker_end_time} < current_time: {now_time}")

if worker['status'] == codes.ERROR:
# shutdown worker
logger.info("Shutting down worker in error status.")
shutdown_worker(actor_id, worker_id)

# Ensure the worker container still exists on the correct host_id. Workers can be deleted after restarts or crashes.

### Ensure the worker container still exists on the correct host_id. Workers can be deleted after restarts or crashes.
# Kubernetes container names have to be completely lowercase.
if conf.container_backend == 'kubernetes':
db_worker_id = worker_id.lower()
else:
db_worker_id = worker_id

worker_container_found = False
if worker['host_id'] == conf.spawner_host_id and worker['status'] == 'READY':
try:
for container in worker_containers:
# Kubernete container names have to be completely lowercase.
if conf.container_backend == 'kubernetes':
db_worker_id = worker_id.lower()
else:
db_worker_id = worker_id
if db_worker_id in container['worker_id']:
worker_container_found = True
break
Expand Down Expand Up @@ -450,4 +454,4 @@ def main():
check_workers_store(ttl)

if __name__ == '__main__':
main()
main()
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ cryptography==3.4.7
pytest
ipython
kubernetes
tapipy>=1.2.20
tapisservice>=1.3.0
tapipy==1.5.0
tapisservice==1.5.0

# we can migrate to a higher version of cloudpickle as soon as we upgrade the samples images (e.g. abacosamples/py3_func:dev)
# currently, differences in the patch version within 0.5 of cloudpickle (e.g. 0.5.2 vs 0.5.6) cause compatibility issues.
Expand Down

0 comments on commit b8ece86

Please sign in to comment.