From 4909f0524477b1cda7f9c2bffacfcf4a97939f51 Mon Sep 17 00:00:00 2001 From: Christophe Haen Date: Fri, 30 Aug 2024 15:08:57 +0200 Subject: [PATCH] feat (FTS): make the monitoring batch size an agent option --- src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py | 6 ++++-- src/DIRAC/DataManagementSystem/ConfigTemplate.cfg | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py b/src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py index 4151a3fffb8..4e009158d43 100644 --- a/src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py +++ b/src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py @@ -106,6 +106,8 @@ def __readConf(self): # lifetime of the proxy we download to delegate to FTS self.proxyLifetime = self.am_getOption("ProxyLifetime", PROXY_LIFETIME) + self.jobMonitoringBatchSize = self.am_getOption("JobMonitoringBatchSize", JOB_MONITORING_BATCH_SIZE) + return S_OK() def initialize(self): @@ -318,7 +320,7 @@ def monitorJobsLoop(self): log.info("Getting next batch of jobs to monitor", f"{loopId}/{nbOfLoops}") # get jobs from DB res = self.fts3db.getActiveJobs( - limit=JOB_MONITORING_BATCH_SIZE, lastMonitor=lastMonitor, jobAssignmentTag=self.assignmentTag + limit=self.jobMonitoringBatchSize, lastMonitor=lastMonitor, jobAssignmentTag=self.assignmentTag ) if not res["OK"]: @@ -353,7 +355,7 @@ def monitorJobsLoop(self): # If we got less to monitor than what we asked, # stop looping - if len(activeJobs) < JOB_MONITORING_BATCH_SIZE: + if len(activeJobs) < self.jobMonitoringBatchSize: break # Commit records after each loop self.dataOpSender.concludeSending() diff --git a/src/DIRAC/DataManagementSystem/ConfigTemplate.cfg b/src/DIRAC/DataManagementSystem/ConfigTemplate.cfg index 406f9d90bc7..c4b9636dab2 100644 --- a/src/DIRAC/DataManagementSystem/ConfigTemplate.cfg +++ b/src/DIRAC/DataManagementSystem/ConfigTemplate.cfg @@ -143,6 +143,12 @@ Agents OperationBulkSize = 20 # How many Job we will monitor in one loop 
JobBulkSize = 20 + # Split JobBulkSize into several chunks + # Bigger numbers (like 100) are efficient when there's a single agent + # When there are multiple agents, it may slow down the overall processing + # because of locking and race conditions + # (This number should of course be smaller than or equal to JobBulkSize) + JobMonitoringBatchSize = 20 # Max number of files to go in a single job MaxFilesPerJob = 100 # Max number of attempt per file