-
Notifications
You must be signed in to change notification settings - Fork 157
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/master' into optimize_heartbeat_timeout_judgement
- Loading branch information
Showing 10 changed files with 332 additions and 96 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Marks issues and pull requests as stale after 90 days without activity and
# closes them 7 days later.
name: Close stale issues

on:
  schedule:
    - cron: '0 0 * * *' # run every day

jobs:
  stale:
    runs-on: ubuntu-latest
    # The stale action labels, comments on and closes issues and pull
    # requests, so the token needs write access even when the repository
    # default for GITHUB_TOKEN is read-only.
    permissions:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v4
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          stale-issue-message: 'This issue has been automatically marked as stale because it has not had recent activity.'
          close-issue-message: 'This issue is being automatically closed due to inactivity.'
          days-before-stale: 90
          days-before-close: 7
          stale-pr-message: 'This pull request has been automatically marked as stale because it has not had recent activity.'
          close-pr-message: 'This pull request is being automatically closed due to inactivity.'
          stale-label: 'stale'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Copyright 2024 The DLRover Authors. All rights reserved. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from typing import List | ||
|
||
|
||
class DiagnoseAction:
    """Container for an ordered list of action strings."""

    def __init__(self):
        # Actions in the order they were added; starts out empty.
        self._actions: List[str] = []

    def add_action(self, action: str) -> None:
        """Append ``action`` to the end of the recorded actions.

        Args:
            action: The action string to record.
        """
        self._actions.append(action)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Copyright 2024 The DLRover Authors. All rights reserved. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from typing import List | ||
|
||
from dlrover.python.diagnosis.common.diagnose_action import DiagnoseAction | ||
from dlrover.python.diagnosis.common.inference_chain import Inference | ||
|
||
|
||
def coordinate_inferences(observations: List[Inference]) -> DiagnoseAction:
    """Build a ``DiagnoseAction`` for the given inferences.

    The current implementation does not inspect ``observations``; it always
    returns a newly constructed ``DiagnoseAction``.

    Args:
        observations: Inferences to coordinate (currently unused).

    Returns:
        A new ``DiagnoseAction`` instance.
    """
    return DiagnoseAction()
65 changes: 65 additions & 0 deletions
65
dlrover/python/diagnosis/inferencechain/inferenceoperator/metrics_collection_operator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Copyright 2024 The DLRover Authors. All rights reserved. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from typing import List | ||
|
||
from dlrover.python.common import env_utils | ||
from dlrover.python.diagnosis.common.constants import DiagnosisDataType | ||
from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric | ||
from dlrover.python.diagnosis.common.inference_chain import ( | ||
Inference, | ||
InferenceAttribute, | ||
InferenceDescription, | ||
InferenceName, | ||
InferenceOperator, | ||
) | ||
from dlrover.python.diagnosis.datacollector.xpu_timer_metric_collector import ( | ||
XpuTimerMetricsCollector, | ||
) | ||
from dlrover.python.elastic_agent.master_client import MasterClient | ||
|
||
|
||
class MetricsCollectionOperator(InferenceOperator):
    """
    MetricsCollectionOperator is the operator to collect
    worker diagnosis metrics.
    """

    def __init__(self):
        super().__init__(None)
        self._xpu_timer_collector = XpuTimerMetricsCollector()
        self._client = MasterClient.singleton_instance()

    def is_compatible(self, inference: Inference) -> bool:
        """Return whether this operator handles ``inference``.

        An inference matches only when its name is ``InferenceName.WORKER``,
        its attribution is ``InferenceAttribute.COLLECT`` and its description
        is ``InferenceDescription.METRICS``.
        """
        return (
            inference.name == InferenceName.WORKER
            and inference.attribution == InferenceAttribute.COLLECT
            and inference.description == InferenceDescription.METRICS
        )

    def infer(self, inferences: List[Inference]) -> List[Inference]:
        """Collect the XPU timer metric and report it via the client.

        Args:
            inferences: Not used by this operator.

        Returns:
            Always an empty list.
        """
        xpu_timer_metric = self._xpu_timer_collector.collect_data()
        # Report only when the collector returned something truthy.
        if xpu_timer_metric:
            agent_xpu_metric = WorkerTrainingMetric(
                data_type=DiagnosisDataType.XPU_TIMER_METRIC,
                data_content=xpu_timer_metric,
                node_id=env_utils.get_node_id(),
                node_type=env_utils.get_node_type(),
                node_rank=env_utils.get_node_rank(),
            )
            self._client.report_diagnosis_agent_metrics(agent_xpu_metric)

        return []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.