diff --git a/dlrover/python/common/global_context.py b/dlrover/python/common/global_context.py index 5b679066a..c12453ed0 100644 --- a/dlrover/python/common/global_context.py +++ b/dlrover/python/common/global_context.py @@ -51,6 +51,7 @@ class DefaultValues(object): SEC_TO_CHANGE_PS = 3600 # 1h SEC_TO_WAIT_FAILED_PS = 600 # 10min HANG_CPU_USAGE_RATE = 0.05 + HANG_DETECTION = 1 class Context(Singleton): @@ -92,6 +93,9 @@ def __init__(self): self.is_tfv1_ps = False self.master_port = None self.relaunch_always = False + # The strategy of 'hang detection': + # 0: log only; 1: notify; 2: with fault tolerance + self.hang_detection = DefaultValues.HANG_DETECTION def set_params_from_brain(self): self.train_speed_record_num = self.get_param_value_from_brain( diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py index 601bb730d..85bec03e4 100644 --- a/dlrover/python/diagnosis/common/constants.py +++ b/dlrover/python/diagnosis/common/constants.py @@ -32,7 +32,9 @@ class DiagnosisDataType(object): XPU_TIMER_METRIC = "XPU_TIMER_METRIC" -class DiagnosisAction(object): +class DiagnosisActionType(object): NO_ACTION = "no_action" RESTART_WORKER = "restart_worker" RELAUNCH_WORKER = "relaunch_worker" + EVENT = "event" + MASTER_RELAUNCH_WORKER = "master_relaunch_worker" diff --git a/dlrover/python/diagnosis/common/diagnose_action.py b/dlrover/python/diagnosis/common/diagnose_action.py deleted file mode 100644 index ea96de464..000000000 --- a/dlrover/python/diagnosis/common/diagnose_action.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 The DLRover Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
from typing import Dict, List, Optional

from dlrover.python.diagnosis.common.constants import DiagnosisActionType


class DiagnosisAction:
    """
    Describe the operation expected to follow a diagnosis.

    An action can be consumed by the master's job manager or used directly
    on a training node.
    """

    def __init__(
        self,
        diagnosis_type: str = DiagnosisActionType.NO_ACTION,
        action_config: Optional[dict] = None,
    ):
        """
        Args:
            diagnosis_type (str): The action type, one of the string
                constants declared on ``DiagnosisActionType``.
            action_config (dict): Optional settings attached to the action.
                A new empty dict is created when it is omitted.
        """

        self._diagnosis_type = diagnosis_type
        # A ``{}`` default in the signature is evaluated once and would be
        # shared by every instance created without an explicit config.
        self._action_config = {} if action_config is None else action_config

    @property
    def diagnosis_type(self):
        """The action type given at construction."""
        return self._diagnosis_type

    @property
    def action_config(self):
        """The configuration dict attached to this action."""
        return self._action_config


class EventAction(DiagnosisAction):
    """Output the specified event."""

    def __init__(
        self,
        event_type: str = "",
        instance: str = "",
        action: str = "",
        msg: str = "",
        labels: Optional[Dict[str, str]] = None,
    ):
        """
        Args:
            event_type (str): The type of the event.
            instance (str): The instance the event refers to.
            action (str): The action name carried by the event.
            msg (str): The event message.
            labels (Dict[str, str]): Optional labels of the event. A new
                empty dict is created when it is omitted.
        """
        super().__init__(DiagnosisActionType.EVENT)
        self._event_type = event_type
        self._instance = instance
        self._action = action
        self._msg = msg
        # Same reason as ``action_config``: never share one default dict.
        self._labels = {} if labels is None else labels

    @property
    def event_type(self):
        return self._event_type

    @property
    def instance(self):
        return self._instance

    @property
    def action(self):
        return self._action

    @property
    def msg(self):
        return self._msg

    @property
    def labels(self):
        return self._labels


class NodeRelaunchAction(DiagnosisAction):
    """Relaunch the specified node."""

    def __init__(self, node_id, node_status, reason):
        """
        Args:
            node_id: Identifier of the node to relaunch.
            node_status: Status recorded for the node.
            reason: Why the node should be relaunched.
        """
        super().__init__(DiagnosisActionType.MASTER_RELAUNCH_WORKER)
        self._node_id = node_id
        self._node_status = node_status
        self._reason = reason

    @property
    def node_id(self):
        return self._node_id

    @property
    def node_status(self):
        return self._node_status

    @property
    def reason(self):
        return self._reason
return DiagnoseAction() +def coordinate_inferences(observations: List[Inference]) -> DiagnosisAction: + return DiagnosisAction() diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 1bcfd0c55..8a9175743 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -11,8 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List +import re +import sys +from typing import Dict, List, Tuple +from dlrover.python.common.global_context import Context +from dlrover.python.common.log import default_logger as logger from dlrover.python.diagnosis.common.constants import DiagnosisDataType from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData from dlrover.python.diagnosis.common.inference_chain import ( @@ -24,6 +28,7 @@ ) HANG_METRIC_PREFIX = "XPU_TIMER_COMMON_HANG" +_dlrover_ctx = Context.singleton_instance() class CheckTrainingHangOperator(InferenceOperator): @@ -46,7 +51,14 @@ def is_compatible(self, inference: Inference) -> bool: return False def infer(self, inferences: List[Inference]) -> List[Inference]: - if not self.data_manager: + if ( + not self.data_manager + or not self.data_manager.with_runtime_context() + ): + logger.info( + "Skip training-hang inference for there is " + "no diagnosis data reference." 
def is_hang(self, diagnosis_data: "List[DiagnosisData]"):
    """
    Decide from the collected diagnosis data whether training is hanging.

    Each record contributes one ``(timestamp, is_worker_hang)`` sample for
    its ``node_rank``. Training is reported as hanging when every worker
    ends with a run of hang samples and the run of the worker selected by
    ``_find_hang_intersection`` lasts longer than the threshold.

    Args:
        diagnosis_data: Records exposing ``data_content``, ``node_rank``
            and ``timestamp``.

    Returns:
        True if a hang is detected, otherwise False.
    """
    # Guard first so that ``None`` or an empty list never reaches ``len``.
    if not diagnosis_data:
        return False

    logger.info(
        "Hang detection start using diagnosis data, "
        f"data number: {len(diagnosis_data)}, "
        f"data size: {sys.getsizeof(diagnosis_data)}."
    )

    # Last run of digits on a line (presumably the metric value - confirm
    # against the metric format). Compiled once instead of once per line.
    last_number = re.compile(r"(\d+)(?!.*\d)")
    worker_hang_metric: Dict[int, List[Tuple[int, bool]]] = {}

    for data in diagnosis_data:
        # filter hang metric
        rank_metrics = [
            line
            for line in data.data_content.splitlines()
            if line.startswith(HANG_METRIC_PREFIX)
        ]

        # if all local rank is hanged, tag worker hang
        hung_ranks = 0
        for rank_metric in rank_metrics:
            match = last_number.search(rank_metric)
            if match and match.group(0) == "1":
                hung_ranks += 1
        # A report without any hang metric line is no evidence of a hang;
        # without the ``bool`` guard ``0 == len([])`` would tag it as hung.
        is_worker_hang = bool(rank_metrics) and hung_ranks == len(
            rank_metrics
        )

        worker_hang_metric.setdefault(data.node_rank, []).append(
            (data.timestamp, is_worker_hang)
        )

    # hang detection rules:
    # 1. 100% worker got hang metric
    # 2. last for 5+ minutes
    hang_id, hang_last = self._find_hang_intersection(worker_hang_metric)
    hang_last_threshold = self._get_hang_time_last_threshold()
    if hang_id != -1 and hang_last > hang_last_threshold:
        logger.info(
            f"Got hang worker: {hang_id}, time last: {hang_last}, "
            f"threshold: {hang_last_threshold}"
        )
        if _dlrover_ctx.hang_detection == 1:
            # TODO
            pass
        elif _dlrover_ctx.hang_detection == 2:
            # TODO
            pass
        return True
    return False


def _get_hang_time_last_threshold(self):
    """Return the minimum hang duration, in seconds, to report a hang."""
    # set 5 minutes for now(second)
    return 5 * 60


def _find_hang_intersection(
    self, worker_hang_metric: Dict[int, List[Tuple[int, bool]]]
) -> Tuple[int, int]:
    """
    Require all workers hang from latest and find the hang intersection.

    For every worker the samples are ordered by timestamp and the trailing
    run of hang samples is counted. The worker with the shortest run is
    selected (the first one wins on ties). The input lists are not
    modified.

    Args:
        worker_hang_metric (Dict[int, List[Tuple[int, bool]]]): Input
            metric in format: node_id: [(timestamp, is_hang), ...]

    Returns:
        ``(worker_id, duration)`` for the selected worker, where duration
        is the timestamp span of its trailing hang run, or ``(-1, -1)``
        when the input is empty or any worker's latest sample is not a
        hang.
    """
    shortest_run = 0
    shortest_worker = -1
    shortest_duration = -1

    # find the intersection from latest
    for worker_id, samples in worker_hang_metric.items():
        # Work on a sorted copy so the caller's list keeps its order.
        ordered = sorted(samples, key=lambda sample: sample[0])

        run = 0
        for _, hanged in reversed(ordered):
            if not hanged:
                break
            run += 1

        if run == 0:
            # there is normal worker
            return -1, -1

        if shortest_run == 0 or run < shortest_run:
            shortest_run = run
            shortest_worker = worker_id
            # Span between the first and last sample of the trailing run.
            shortest_duration = ordered[-1][0] - ordered[-run][0]

    if shortest_worker == -1:
        return -1, -1
    return shortest_worker, shortest_duration
--git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index 7b1619829..85054a4f3 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -25,11 +25,11 @@ from dlrover.python.common.singleton import Singleton from dlrover.python.common.worker import WorkerContext from dlrover.python.diagnosis.common.constants import ( - DiagnosisAction, + DiagnosisActionType, DiagnosisConstant, InferenceConfigKey, ) -from dlrover.python.diagnosis.common.diagnose_action import DiagnoseAction +from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.diagnosis.common.inference_chain import ( Inference, @@ -105,7 +105,7 @@ def _observe(self) -> List[Inference]: def _diagnose_observations( self, observations: List[Inference] - ) -> DiagnoseAction: + ) -> DiagnosisAction: conclusions: List[Inference] = [] for ob in observations: ic = InferenceChain([ob], self._diagnosis_operators) @@ -165,7 +165,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: f"{worker_context.worker_spec.max_restarts} " f"attempts left; will restart worker group." ) - return DiagnosisAction.RESTART_WORKER + return DiagnosisActionType.RESTART_WORKER else: logger.info( f"[{worker_context.worker_spec.role}] Worker group " @@ -174,7 +174,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: f"no attempts({worker_context.worker_spec.max_restarts}) " "left; will relaunch." 
) - return DiagnosisAction.RELAUNCH_WORKER + return DiagnosisActionType.RELAUNCH_WORKER def _report_failure_to_master( self, failures: Dict[int, ProcessFailure], restart_count: int diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index 29eebd7d4..6a760820c 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -88,7 +88,7 @@ ) from dlrover.python.common.log import default_logger as logger from dlrover.python.common.worker import WorkerContext -from dlrover.python.diagnosis.common.constants import DiagnosisAction +from dlrover.python.diagnosis.common.constants import DiagnosisActionType from dlrover.python.elastic_agent.config.paral_config_tuner import ( ParalConfigTuner, ) @@ -852,6 +852,7 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: logger.warning(f"Unexpected exception when ending: {e}") finally: self._client.report_succeeded() + logger.info("Succeeded and exit.") return run_result elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED}: @@ -871,9 +872,9 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: except Exception as e: logger.warning(f"Failed to diagnose errors: {e}") if self._remaining_failovers > 0: - action = DiagnosisAction.RESTART_WORKER + action = DiagnosisActionType.RESTART_WORKER else: - action = DiagnosisAction.RELAUNCH_WORKER + action = DiagnosisActionType.RELAUNCH_WORKER self._process_diagnose_action(action) if self._worker_group.state == WorkerState.FAILED: return run_result @@ -886,10 +887,10 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: raise Exception(f"[{role}] worker group in {state.name} state") def _process_diagnose_action(self, action: str): - if action == DiagnosisAction.RESTART_WORKER: + if action == DiagnosisActionType.RESTART_WORKER: self._remaining_failovers -= 1 self._restart_workers(self._worker_group) - elif action == DiagnosisAction.RELAUNCH_WORKER: + 
elif action == DiagnosisActionType.RELAUNCH_WORKER: self._stop_workers(self._worker_group) self._worker_group.state = WorkerState.FAILED diff --git a/dlrover/python/master/args.py b/dlrover/python/master/args.py index 05bf6bd35..177c91f71 100644 --- a/dlrover/python/master/args.py +++ b/dlrover/python/master/args.py @@ -86,6 +86,13 @@ def _build_master_args_parser(): type=pos_int, help="The number of nodes", ) + parser.add_argument( + "--hang_detection", + default=1, + type=pos_int, + help="The strategy of 'hang detection', " + "0: log only; 1: notify; 2: with fault tolerance", + ) add_params(parser) return parser diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 17dd073ea..62b49c663 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -11,9 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import threading import time +from collections import deque from datetime import datetime, timedelta +from itertools import islice from typing import Dict, List from dlrover.python.common.log import default_logger as logger @@ -41,11 +44,17 @@ def has_expired(timestamp: float, time_period: int) -> bool: class DiagnosisManager: - def __init__(self): + def __init__(self, job_manager=None): + self._job_manager = job_manager self._is_observing_started = False - self._data_manager: DiagnosisDataManager = DiagnosisDataManager(600) + self._data_manager: DiagnosisDataManager = DiagnosisDataManager( + job_manager, 600 + ) self._diagnostician: Diagnostician = Diagnostician(self._data_manager) + def is_job_manager_exist(self) -> bool: + return self._job_manager is not None + def collect_diagnosis_data(self, data: DiagnosisData): self._data_manager.store_data(data) @@ -72,8 +81,8 @@ def start_observing(self): try: thread = threading.Thread( - target=self._diagnose_failures(), - 
class DiagnosisDataManager:
    """
    Keep recent diagnosis records grouped by data type.

    ``store_data`` and ``get_data`` serialize access with an internal lock.
    Every data type is held in a bounded deque, and records older than
    ``expire_time_period`` are dropped whenever new data is stored.
    """

    # Upper bound of records retained per data type.
    _MAX_RECORDS_PER_TYPE = 100000

    def __init__(self, job_manager=None, expire_time_period=600):
        """
        Args:
            job_manager: Optional job manager reference; its presence is
                what ``with_runtime_context`` reports.
            expire_time_period: Age limit handed to ``has_expired`` when
                old records are cleaned (presumably seconds - confirm).
        """
        self._diagnosis_data: Dict[str, deque] = {}
        self.expire_time_period = expire_time_period
        self._job_manager = job_manager
        self._lock = threading.Lock()

    @property
    def job_manager(self):
        return self._job_manager

    @property
    def data(self):
        """The underlying mapping of data type to stored records."""
        return self._diagnosis_data

    def with_runtime_context(self) -> bool:
        """Return True when a job manager was supplied."""
        return self.job_manager is not None

    def store_data(self, data: "DiagnosisData"):
        """Append ``data`` under its ``data_type`` and drop expired ones."""
        data_type = data.data_type
        with self._lock:
            if data_type not in self.data:
                self.data[data_type] = deque(
                    maxlen=self._MAX_RECORDS_PER_TYPE
                )
            self.data[data_type].append(data)
            self._clean_diagnosis_data(data_type)

    def get_data(self, data_type: str) -> List["DiagnosisData"]:
        """Return a list copy of the records stored for ``data_type``."""
        with self._lock:
            if data_type not in self.data:
                return []
            return list(self.data[data_type])

    def get_data_size(self):
        """
        Return ``sys.getsizeof`` of the storage mapping.

        NOTE: the value is shallow; it does not include the stored records.
        """
        return sys.getsizeof(self.data)

    def _clean_diagnosis_data(self, data_type: str):
        """
        Drop the expired records at the head of ``data_type``'s deque.

        Callers are expected to hold ``self._lock`` (``store_data`` does).
        """
        records = self.data.get(data_type)
        if not records:
            return

        # Pop in place: rebuilding the deque from a slice would create an
        # unbounded deque and silently lose the ``maxlen`` limit.
        while records and has_expired(
            records[0].timestamp, self.expire_time_period
        ):
            records.popleft()
def _diagnosis_action_consumer(self):
    """
    Consume queued diagnosis actions until the manager is stopped.

    The loop polls the action queue. Event actions are forwarded to
    ``self._report_event``; node relaunch actions are not handled yet.
    Any exception is logged and the loop keeps running.
    """
    logger.info("Start consuming diagnosis actions.")
    while True:
        if self._stopped:
            logger.info("Stop consuming diagnosis actions.")
            break
        try:
            if self.get_diagnosis_actions_size() == 0:
                time.sleep(5)
                continue

            action = self._diagnosis_action_queue.get()
            # ``DiagnosisAction`` exposes its type as ``diagnosis_type``;
            # reading ``action.type`` raises AttributeError, which the
            # handler below would swallow and the action would be lost.
            action_type = action.diagnosis_type
            if action_type == DiagnosisActionType.EVENT:
                self._report_event(
                    action.event_type,
                    action.instance,
                    action.action,
                    action.msg,
                    action.labels,
                )
            elif action_type == DiagnosisActionType.MASTER_RELAUNCH_WORKER:
                # TODO
                pass
        except Exception as e:
            # Deliberately broad: this loop must survive a bad action.
            logger.warning(e)
            time.sleep(10)
        time.sleep(1)


def get_node_required_info(self):
    """Return the stored node requirement value (``_nodes_required``)."""
    return self._nodes_required


def get_total_node_num_by_type(self, node_type):
    """
    Return how many nodes are recorded for ``node_type``.

    Returns 0 when no nodes are recorded at all or when ``node_type`` is
    unknown, instead of raising ``KeyError`` for the latter.
    """
    if not self._job_nodes:
        return 0

    return len(self._job_nodes.get(node_type, {}))


def get_job_strategy(self):
    """Return the distribution strategy from the job arguments."""
    return self._job_args.distribution_strategy
from abc import ABCMeta, abstractmethod +from queue import Queue from typing import Dict from dlrover.python.common.log import default_logger as logger from dlrover.python.common.node import Node +from dlrover.python.diagnosis.common.constants import DiagnosisActionType +from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction from dlrover.python.master.hyperparams.simple_strategy_generator import ( SimpleStrategyGenerator, ) @@ -54,10 +57,12 @@ def __init__( self._error_monitor: ErrorMonitor = error_monitor self._job_nodes: Dict[str, Dict[int, Node]] = {} - self._nodes_required = (0, 0, 0) + self._nodes_required = (0, 0, 0) # (min-nodes, max-nodes, timeout) self._training_node_config = TrainingNodeConfig(external_config) + self._diagnosis_action_queue = Queue() + @abstractmethod def start(self): pass @@ -112,9 +117,6 @@ def post_ps_ready(self): def stop(self): pass - def update_node_service_addr(self, node_type, node_id, service_addr): - pass - @abstractmethod def get_cur_cluster_ps(self): pass @@ -199,11 +201,20 @@ def collect_node_heart_beat(self, node_type, node_id, timestamp): """Collect the heart beat message of nodes.""" pass + def put_diagnosis_action(self, diagnosis_action: DiagnosisAction): + self._diagnosis_action_queue.put(diagnosis_action) + + def get_diagnosis_actions_size(self): + return self._diagnosis_action_queue.qsize() + def sync_node_training_port(self, node_id, port) -> SyncNodeTrainingPorts: return self._training_node_config.sync_node_training_port( node_id, port ) + def update_node_service_addr(self, node_type, node_id, service_addr): + pass + def update_node_required_info(self, min_required, max_required, timeout): """ Update the nodes min/max requirements. 
diff --git a/dlrover/python/master/servicer.py b/dlrover/python/master/servicer.py index e8250dc17..e980ea5a3 100644 --- a/dlrover/python/master/servicer.py +++ b/dlrover/python/master/servicer.py @@ -360,7 +360,7 @@ def report(self, request, _): elif isinstance(message, grpc.NodeCheckpointState): success = self._sync_checkpoint(node_type, node_id, message) elif isinstance(message, grpc.DiagnosisReportData): - success = self._report_worker_diagnosis_data(message) + success = self._report_node_diagnosis_data(message) elif isinstance(message, grpc.SucceededRequest): success = self._report_succeeded(node_id, node_type) @@ -618,7 +618,7 @@ def _sync_checkpoint( rdzv_manager = self._rdzv_managers[RendezvousName.ELASTIC_TRAINING] return rdzv_manager.sync_ckpt_nodes(node_id, message.step) - def _report_worker_diagnosis_data(self, message: grpc.DiagnosisReportData): + def _report_node_diagnosis_data(self, message: grpc.DiagnosisReportData): if self._diagnosis_manager: data_cls: Optional[DiagnosisData] = getattr( self._diagnosis_data_module, diff --git a/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all new file mode 100644 index 000000000..e384fb2de --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE 
XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 
7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE 
XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1 
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# 
TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some new file mode 100644 index 000000000..59142aa4c --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary 
+exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 
341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1 
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge 
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/data/xpu_timer_metrics b/dlrover/python/tests/data/xpu_timer/normal/xpu_timer_metric_0 similarity index 100% rename from dlrover/python/tests/data/xpu_timer_metrics rename to 
dlrover/python/tests/data/xpu_timer/normal/xpu_timer_metric_0 diff --git a/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single b/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single new file mode 100644 index 000000000..0e646c2ab --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/test_diagnosis.py b/dlrover/python/tests/test_diagnosis.py index 501dd1d08..6bc1ef171 100644 --- a/dlrover/python/tests/test_diagnosis.py +++ b/dlrover/python/tests/test_diagnosis.py @@ -14,7 +14,16 @@ import time import unittest -from dlrover.python.diagnosis.common.constants import DiagnosisDataType +from dlrover.python.common.constants import NodeStatus +from dlrover.python.diagnosis.common.constants import ( + DiagnosisActionType, + DiagnosisDataType, +) +from dlrover.python.diagnosis.common.diagnosis_action import ( + DiagnosisAction, + EventAction, + NodeRelaunchAction, +) from dlrover.python.diagnosis.common.diagnosis_data import TrainingLog from dlrover.python.master.diagnosis.diagnosis import DiagnosisDataManager @@ -27,7 +36,7 @@ def tearDown(self): pass def test_data_manager(self): - mgr = DiagnosisDataManager(5) + mgr = DiagnosisDataManager(expire_time_period=3) log1 = TrainingLog(0) mgr.store_data(log1) time.sleep(1) @@ -37,12 +46,39 @@ def test_data_manager(self): logs = mgr.get_data(DiagnosisDataType.TRAINING_LOG) self.assertEqual(len(logs), 2) - time.sleep(6) + time.sleep(4) log3 = TrainingLog(0) mgr.store_data(log3) logs = 
mgr.get_data(DiagnosisDataType.TRAINING_LOG) self.assertEqual(len(logs), 1) + def test_action_basic(self): + basic_action = DiagnosisAction() + self.assertEqual( + basic_action.diagnosis_type, DiagnosisActionType.NO_ACTION + ) + + event_action = EventAction( + "info", "job", "test", "test123", {"k1": "v1"} + ) + self.assertEqual( + event_action.diagnosis_type, DiagnosisActionType.EVENT + ) + self.assertEqual(event_action.event_type, "info") + self.assertEqual(event_action.instance, "job") + self.assertEqual(event_action.action, "test") + self.assertEqual(event_action.msg, "test123") + self.assertEqual(event_action.labels, {"k1": "v1"}) + + node_relaunch_action = NodeRelaunchAction(1, NodeStatus.FAILED, "hang") + self.assertEqual( + node_relaunch_action.diagnosis_type, + DiagnosisActionType.MASTER_RELAUNCH_WORKER, + ) + self.assertEqual(node_relaunch_action.node_id, 1) + self.assertEqual(node_relaunch_action.node_status, NodeStatus.FAILED) + self.assertEqual(node_relaunch_action.reason, "hang") + if __name__ == "__main__": unittest.main() diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py index c6770f677..ce9d2a76c 100644 --- a/dlrover/python/tests/test_diagnosis_agent.py +++ b/dlrover/python/tests/test_diagnosis_agent.py @@ -20,7 +20,7 @@ from dlrover.python.common import env_utils from dlrover.python.common.constants import RendezvousName from dlrover.python.common.worker import WorkerContext -from dlrover.python.diagnosis.common.constants import DiagnosisAction +from dlrover.python.diagnosis.common.constants import DiagnosisActionType from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.elastic_agent.diagnosis.diagnosis_agent import ( DiagnosisAgent, @@ -82,21 +82,21 @@ def test_diagnose_training(self): ) action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RESTART_WORKER) + self.assertEqual(action, 
DiagnosisActionType.RESTART_WORKER) agent._errors = "error code is 507035" action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RELAUNCH_WORKER) + self.assertEqual(action, DiagnosisActionType.RELAUNCH_WORKER) agent._errors = "error code is 11111" wc.remaining_failovers = 0 action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RELAUNCH_WORKER) + self.assertEqual(action, DiagnosisActionType.RELAUNCH_WORKER) agent._errors = " #" wc.remaining_failovers = 2 action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RESTART_WORKER) + self.assertEqual(action, DiagnosisActionType.RESTART_WORKER) def test_worker_training_metric(self): test = WorkerTrainingMetric( diff --git a/dlrover/python/tests/test_diagnosis_data_collector.py b/dlrover/python/tests/test_diagnosis_data_collector.py index 7a6eb10e0..a69ea2daa 100644 --- a/dlrover/python/tests/test_diagnosis_data_collector.py +++ b/dlrover/python/tests/test_diagnosis_data_collector.py @@ -71,7 +71,7 @@ def test_xpu_timer_metric_collector(self): self.assertEqual(collector.collect_data(), "") - file = "data/xpu_timer_metrics" + file = "data/xpu_timer/xpu_timer_metric_single" file_path = os.path.join(os.path.dirname(__file__), file) with open(file_path, "r", encoding="utf-8") as file: test_metrics = file.read() diff --git a/dlrover/python/tests/test_inference_chain.py b/dlrover/python/tests/test_inference_chain.py index 61a37c160..a44186078 100644 --- a/dlrover/python/tests/test_inference_chain.py +++ b/dlrover/python/tests/test_inference_chain.py @@ -13,14 +13,18 @@ import os import unittest +from typing import Dict, List, Tuple +from unittest import mock from unittest.mock import patch from dlrover.python.common import env_utils from dlrover.python.common.constants import NodeEnv, NodeType from dlrover.python.diagnosis.common.constants import ( + DiagnosisDataType, EnvConfigKey, InferenceConfigKey, ) +from 
dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.diagnosis.common.inference_chain import ( Inference, InferenceAttribute, @@ -55,7 +59,256 @@ def setUp(self): def tearDown(self): os.environ.clear() + def test_check_training_hang_operator_find_intersection(self): + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [(1, True), (2, False), (3, True), (4, True), (5, True)], + 2: [(1, True), (2, True), (3, True), (4, True), (5, False)], + 3: [(1, False), (2, True), (3, True), (4, True), (5, True)], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual( + operator._find_hang_intersection(test_metric), (-1, -1) + ) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (7, True), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (7, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (7, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._find_hang_intersection(test_metric), (2, 1)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (8, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._find_hang_intersection(test_metric), (2, 2)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (8, False), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (8, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), 
+ (6, True), + (8, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual( + operator._find_hang_intersection(test_metric), (-1, -1) + ) + + def test_check_training_hang_operator_is_hang(self): + operator = CheckTrainingHangOperator(None) + operator._get_hang_time_last_threshold = mock.MagicMock(return_value=0) + + # prepare test data + normal_metric, some_abnormal_metric, all_abnormal_metric = "", "", "" + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/normal/xpu_timer_metric_0", + ) + with open(file_path, "r", encoding="utf-8") as file: + normal_metric = file.read() + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/hang/xpu_timer_metric_some", + ) + with open(file_path, "r", encoding="utf-8") as file: + some_abnormal_metric = file.read() + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/hang/xpu_timer_metric_all", + ) + with open(file_path, "r", encoding="utf-8") as file: + all_abnormal_metric = file.read() + + # test data: no worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertFalse(operator.is_hang(test_data)) + test_data.clear() + + # test data0: 1 of 2 worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + 
data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertFalse(operator.is_hang(test_data)) + test_data.clear() + + # test data: 2 of 2 worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertTrue(operator.is_hang(test_data)) + test_data.clear() + def test_check_training_hang_operator(self): + # no data operator = CheckTrainingHangOperator(None) inf = Inference( name=InferenceName.TRAINING, diff --git a/dlrover/python/tests/test_job_manager.py b/dlrover/python/tests/test_job_manager.py index 770ecde85..1c28871fd 100644 --- 
a/dlrover/python/tests/test_job_manager.py +++ b/dlrover/python/tests/test_job_manager.py @@ -672,6 +672,7 @@ def test_start_and_stop(self): manager.start() active_threads_name = [t.name for t in threading.enumerate()] + self.assertIn("diagnosis_action_consumer", active_threads_name) self.assertIn("node_monitor", active_threads_name) self.assertIn("node_heart_beat_monitor", active_threads_name) manager.stop() @@ -721,6 +722,19 @@ def test_get_pending_timeout(self): # reset _dlrover_context.seconds_to_wait_pending_pod = 900 + def test_multi_getting(self): + params = MockK8sPSJobArgs() + params.initilize() + manager = create_job_manager(params, SpeedMonitor()) + self.assertEqual(manager.get_total_node_num_by_type(NodeType.PS), 0) + manager._init_nodes() + + self.assertEqual(manager.get_job_strategy(), DistributionStrategy.PS) + self.assertEqual(manager.get_total_node_num_by_type(NodeType.PS), 3) + self.assertEqual(manager.get_node_required_info(), (0, 0, 0)) + manager._nodes_required = (3, 5, 100) + self.assertEqual(manager.get_node_required_info(), (3, 5, 100)) + class LocalJobManagerTest(unittest.TestCase): def test_local_job_manager(self): @@ -767,3 +781,5 @@ def test_local_job_manager(self): job_manager.update_succeeded_node(0, "unknown") except Exception: self.fail() + + self.assertEqual(job_manager.get_diagnosis_actions_size(), 0) diff --git a/dlrover/python/tests/test_servicer.py b/dlrover/python/tests/test_servicer.py index b30b6e999..799383825 100644 --- a/dlrover/python/tests/test_servicer.py +++ b/dlrover/python/tests/test_servicer.py @@ -408,7 +408,7 @@ def test_sync_checkpoint(self): success = self.servicer._sync_checkpoint(NodeType.WORKER, 1, message) self.assertTrue(success) - def test_report_worker_diagnosis_data(self): + def test_report_node_diagnosis_data(self): test = WorkerTrainingMetric( data_content="test123", node_id=env_utils.get_node_id(), @@ -422,7 +422,7 @@ def test_report_worker_diagnosis_data(self): test.to_json(), test.node_rank, ) - 
self.assertTrue(self.servicer._report_worker_diagnosis_data(request)) + self.assertTrue(self.servicer._report_node_diagnosis_data(request)) def test_report_succeeded(self): self.assertTrue(self.servicer._report_succeeded(0, NodeType.WORKER))