refine device type #488

Open · wants to merge 2 commits into main
2 changes: 2 additions & 0 deletions configs/common/train.py
@@ -128,6 +128,8 @@
# NOTE: if it is None, LiBai will automatically set pipeline_stage_id
# `auto_pipeline_stage_id` and `actual_pipeline_stage_id` will be saved in `config.yaml`
custom_pipeline_stage_id=None,
# set device type
device_type="cuda",
),

# the device type of input tensors for model, defaults to "cuda".
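For reference, a sketch of how the new key could be overridden from a user config. The `train.dist` namespace is inferred from the surrounding keys, and the accepted values depend on which device types the local OneFlow build supports:

# hypothetical user config, loaded through LiBai's LazyConfig machinery
from .common.train import train  # relative-import style used by LiBai's own configs

# Device type used when building placements; the default stays "cuda".
train.dist.device_type = "cpu"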
22 changes: 14 additions & 8 deletions libai/inference/generator/generation_utils.py
@@ -55,6 +55,9 @@


class Generator:
dist_utils = dist.get_dist_util()
device_type = dist_utils.device_type

def _prepare_model_inputs(
self,
inputs: Optional[flow.Tensor] = None,
@@ -101,7 +104,7 @@ def _prepare_input_ids_for_generation(
shape,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
)
* -100
)
@@ -113,7 +116,7 @@ def _prepare_input_ids_for_generation(
(1, 1),
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
)
* bos_token_id
)
@@ -137,7 +140,7 @@ def _prepare_attention_mask_for_generation(
inputs.shape[:2],
dtype=flow.bool,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
)

def _prepare_encoder_decoder_kwargs_for_generation(
@@ -171,7 +174,7 @@ def _prepare_decoder_input_ids_for_generation(
(batch_size, 1),
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
)
* decoder_start_token_id
)
@@ -195,14 +198,15 @@ def _expand_inputs_for_generation(
is_encoder_decoder: bool = False,
attention_mask: Optional[flow.Tensor] = None,
encoder_outputs: Optional[flow.Tensor] = None,
device_type="cuda",
**model_kwargs,
):
expanded_return_idx = (
flow.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1)
)
expanded_return_idx = expanded_return_idx.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(device_type, list(range(dist.get_world_size()))),
)

input_ids = input_ids.index_select(0, expanded_return_idx)
@@ -589,12 +593,12 @@ def multinomial_sample(
probs = nn.functional.softmax(next_token_scores, dim=-1)
probs = probs.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
).to_local()
next_tokens = flow.multinomial(probs, num_samples=1).squeeze(1)
next_tokens = next_tokens.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
)
unfinished_sequences = unfinished_sequences.to_global(
sbp=next_tokens.sbp, placement=next_tokens.placement
Expand Down Expand Up @@ -687,7 +691,7 @@ def beam_search(
(batch_size, num_beams),
dtype=flow.float,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
)
beam_scores[:, 1:] = -1e9
beam_scores = beam_scores.view((batch_size * num_beams,))
@@ -1019,6 +1023,7 @@ def generate(
input_ids,
expand_size=num_return_sequences,
is_encoder_decoder=self.cfg.is_encoder_decoder,
device_type=self.device_type,
**model_kwargs,
)

@@ -1057,6 +1062,7 @@ def generate(
input_ids,
expand_size=num_beams,
is_encoder_decoder=self.cfg.is_encoder_decoder,
device_type=self.device_type,
**model_kwargs,
)

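All of the hunks above are the same substitution: tensors that used to be placed with a literal "cuda" now read the device from the dist util, either through the new class attribute or the new `device_type` parameter. A minimal standalone sketch of that pattern, with a made-up bos token id:

import oneflow as flow
from libai.utils import distributed as dist

# "cuda" unless the config overrides it; the device_type attribute exists only with this PR applied.
device_type = dist.get_dist_util().device_type

bos_ids = (
    flow.ones(
        (1, 1),
        dtype=flow.long,
        sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        placement=flow.placement(device_type, list(range(dist.get_world_size()))),
    )
    * 101  # stand-in bos_token_id
)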
5 changes: 4 additions & 1 deletion libai/tokenizer/tokenization_base.py
@@ -138,6 +138,8 @@ class PreTrainedTokenizer(object):
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
dist_utils = dist.get_dist_util()
device_type = dist_utils.device_type

SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
@@ -783,7 +785,8 @@ def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **
elif is_global:
sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
placement = kwargs.get(
"placement", flow.placement("cuda", list(range(dist.get_world_size())))
"placement",
flow.placement(self.device_type, list(range(dist.get_world_size()))),
)
return_token_ids = flow.tensor(
token_ids, sbp=sbp, placement=placement, dtype=flow.long
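The `convert_to_tensors` change is the same default swap, isolated below as a hypothetical helper (`resolve_placement` is not LiBai API): an explicit `placement` kwarg still wins; only the fallback moves from a literal "cuda" to the configured device type.

import oneflow as flow
from libai.utils import distributed as dist

def resolve_placement(**kwargs):
    # Mirrors the kwargs.get(...) fallback in the hunk above.
    device_type = dist.get_dist_util().device_type  # attribute added by this PR
    return kwargs.get(
        "placement",
        flow.placement(device_type, list(range(dist.get_world_size()))),
    )

resolve_placement()                                      # follows the configured device type
resolve_placement(placement=flow.placement("cpu", [0]))  # explicit override is untouched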
4 changes: 3 additions & 1 deletion libai/utils/distributed.py
@@ -14,6 +14,7 @@
# limitations under the License.

import logging
import os

import dill
import numpy as np
@@ -438,7 +439,8 @@ def convert_to_distributed_default_setting(t):
def ttol(tensor, pure_local=False, ranks=None):
"""Global tensor to local tensor."""
if tensor.is_global:
placement = tensor.placement if not ranks else flow.placement("cuda", ranks)
device_type = os.getenv("DEVICE_TYPE", "cuda")
placement = tensor.placement if not ranks else flow.placement(device_type, ranks)
if pure_local:
tensor = tensor.to_global(placement=placement).to_local()
else:
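`ttol` has no config object or `self` in scope, so its fallback comes from a `DEVICE_TYPE` environment variable rather than the new config key, keeping "cuda" as the default. A single-process sketch, assuming a CPU-capable OneFlow build:

import os

import oneflow as flow
from libai.utils import distributed as dist

# Read inside ttol() at call time; when unset, the fallback is "cuda", exactly as before this PR.
os.environ["DEVICE_TYPE"] = "cpu"

t = flow.ones((2, 2), sbp=flow.sbp.broadcast, placement=flow.placement("cpu", [0]))
local = dist.ttol(t, pure_local=True, ranks=[0])  # re-placed on DEVICE_TYPE, then .to_local()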
20 changes: 14 additions & 6 deletions projects/MagicPrompt/layers/attention_layer.py
@@ -208,12 +208,20 @@ def forward(
causal_mask = causal_mask.repeat(attention_scores.size(0), 1, 1, 1)
causal_mask = causal_mask.to_global(placement=attention_scores.placement)
fill_value = flow.finfo(attention_scores.dtype).min
-mask_value = flow.ones(
-    causal_mask.size(),
-    dtype=attention_scores.dtype,
-    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-    placement=attention_scores.placement,
-).fill_(fill_value)
+if causal_mask.shape[0] == 1:
+    mask_value = flow.ones(
+        causal_mask.size(),
+        dtype=attention_scores.dtype,
+        sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+        placement=attention_scores.placement,
+    ).fill_(fill_value)
+else:
+    mask_value = flow.ones(
+        causal_mask.size(),
+        dtype=attention_scores.dtype,
+        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
+        placement=attention_scores.placement,
+    ).fill_(fill_value)
attention_scores = flow.where(causal_mask, attention_scores, mask_value)

if attention_mask is not None:
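The new branch picks `mask_value`'s SBP from the shape of `causal_mask`: a per-sample mask (leading dim > 1) gets `split(0)` on the first parallel dimension, presumably so that `flow.where` combines it with batch-split attention scores under data parallelism, while a singleton mask keeps the old broadcast layout. A two-rank illustration of the two layouts (the 2x1 hierarchy, ranks, and shapes are made up; launch with two processes):

import oneflow as flow

# Hypothetical 2x1 hierarchy: two data-parallel ranks, no tensor parallelism.
# Run with: python3 -m oneflow.distributed.launch --nproc_per_node 2 sbp_demo.py
placement = flow.placement("cuda", [[0], [1]])

# Per-sample mask: each rank holds its own slice along dim 0.
split_mask = flow.ones(
    (4, 1, 8, 8),
    dtype=flow.bool,
    sbp=[flow.sbp.split(0), flow.sbp.broadcast],
    placement=placement,
)

# Singleton mask: every rank holds the full, identical tensor.
bcast_mask = flow.ones(
    (1, 1, 8, 8),
    dtype=flow.bool,
    sbp=[flow.sbp.broadcast, flow.sbp.broadcast],
    placement=placement,
)

print(split_mask.to_local().shape)  # (2, 1, 8, 8) on each rank
print(bcast_mask.to_local().shape)  # (1, 1, 8, 8) on each rank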