Expert parallel #39

Open · wants to merge 6 commits into base: main
2 changes: 2 additions & 0 deletions dolomite_engine/arguments.py
@@ -336,6 +336,8 @@ class DistributedArgs(BaseArgs):
     tensor_parallel_word_embeddings: bool = False
     # whether to use sequence parallel
     sequence_parallel: bool = False
+    # whether to use expert parallel
+    expert_parallel: bool = False
     # data parallel world size
     data_parallel_size: int | None = None
     # distributed timeout for NCCL in minutes
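The new flag defaults to False, so existing configurations are unaffected. A minimal sketch of how a training script could consume it; the dataclass below only mirrors the fields shown in the hunk above, while the real DistributedArgs derives from BaseArgs and carries more fields:

    from dataclasses import dataclass

    @dataclass
    class DistributedArgsSketch:
        # illustrative subset of dolomite_engine.arguments.DistributedArgs
        tensor_parallel_word_embeddings: bool = False
        sequence_parallel: bool = False
        expert_parallel: bool = False  # flag added by this PR
        data_parallel_size: int | None = None

    args = DistributedArgsSketch(expert_parallel=True)
    if args.expert_parallel:
        # here the engine would shard MoE experts across an expert-parallel group
        pass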
2 changes: 2 additions & 0 deletions dolomite_engine/hf_models/mixins/moe_TP/base.py
@@ -12,6 +12,7 @@ class PreTrainedMoEModelMixin_TP(PreTrainedMoEModelMixin, PreTrainedModelMixin_T
     def __init__(self, config: CommonConfig, *args, **kwargs):
         self.tensor_parallel_word_embeddings = kwargs.get("tensor_parallel_word_embeddings", False)
         self.sequence_parallel = kwargs.get("sequence_parallel", False)
+        self.expert_parallel = kwargs.get("expert_parallel", False)
 
         super().__init__(config, *args, **kwargs)
 
@@ -55,6 +56,7 @@ def _init_model(self, config: CommonConfig, **kwargs) -> None:
                     moe_implementation=self.moe_implementation,
                     layer_idx=i,
                     sequence_parallel=self.sequence_parallel,
+                    expert_parallel=self.expert_parallel,
                 )
                 for i in range(config.num_hidden_layers)
             ]
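The flag rides along with the other parallelism kwargs: the TP mixin pulls it out of **kwargs and forwards it to every transformer block it builds. A compressed sketch of that pattern, with illustrative class names rather than the repository's own:

    import torch.nn as nn

    class BlockSketch(nn.Module):
        def __init__(self, expert_parallel: bool = False) -> None:
            super().__init__()
            self.expert_parallel = expert_parallel

    class ModelSketch(nn.Module):
        def __init__(self, num_layers: int, **kwargs) -> None:
            super().__init__()
            # mirrors kwargs.get("expert_parallel", False) in the mixin above
            self.expert_parallel = kwargs.get("expert_parallel", False)
            self.h = nn.ModuleList(
                [BlockSketch(expert_parallel=self.expert_parallel) for _ in range(num_layers)]
            )

    model = ModelSketch(num_layers=2, expert_parallel=True)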
6 changes: 3 additions & 3 deletions dolomite_engine/hf_models/models/moe_dolomite/moe/base.py
@@ -89,7 +89,7 @@ def __init__(
             std /= math.sqrt(m_width)
         self.gate = ParameterizedTransposedLinear(
             in_features=self.hidden_size,
-            out_features=config.num_experts,
+            out_features=self.num_experts,
             bias=False,
             std=std,
         )
@@ -98,7 +98,7 @@ def __init__(
         if init_method == InitMethod.mup:
             std /= math.sqrt(m_width)
         self.c_fc = ParameterizedExperts(
-            num_experts=config.num_experts,
+            num_experts=self.num_experts,
             in_features=self.hidden_size,
             out_features=2 * self.intermediate_size if is_glu(activation_function) else self.intermediate_size,
             add_bias=config.add_bias,
@@ -111,7 +111,7 @@ def __init__(
         if init_method == InitMethod.mup:
             std /= math.sqrt(m_width)
         self.c_proj = ParameterizedExperts(
-            num_experts=config.num_experts,
+            num_experts=self.num_experts,
             in_features=self.intermediate_size,
             out_features=self.hidden_size,
             add_bias=config.add_bias,
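Reading the expert count from self.num_experts rather than config.num_experts lets an expert-parallel subclass override the instance attribute with its local shard while the config keeps the global count. A hedged sketch of that idea; the attribute name comes from the diff, but the sharding rule below is an assumption, not the repository's exact code:

    # Illustration only: an expert-parallel subclass could shrink the
    # per-rank expert count while the config keeps the global total.
    class MoESketch:
        def __init__(self, num_experts_global: int, ep_world_size: int = 1) -> None:
            assert num_experts_global % ep_world_size == 0, "experts must divide evenly"
            # weights such as c_fc / c_proj are then sized with self.num_experts
            self.num_experts = num_experts_global // ep_world_size

    moe = MoESketch(num_experts_global=8, ep_world_size=2)
    assert moe.num_experts == 4  # each rank builds weights for 4 experts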
6 changes: 3 additions & 3 deletions dolomite_engine/hf_models/models/moe_dolomite/moe/scatter.py
@@ -84,7 +84,7 @@ def __init__(
             std /= math.sqrt(m_width)
         self.gate = ParameterizedTransposedLinear(
             in_features=self.hidden_size,
-            out_features=config.num_experts,
+            out_features=self.num_experts,
             bias=False,
             std=std,
         )
@@ -93,7 +93,7 @@ def __init__(
         if init_method == InitMethod.mup:
             std /= math.sqrt(m_width)
         self.c_fc = ParameterizedScatteredExperts(
-            num_experts=config.num_experts,
+            num_experts=self.num_experts,
             in_features=self.hidden_size,
             out_features=2 * self.intermediate_size if is_glu(activation_function) else self.intermediate_size,
             add_bias=config.add_bias,
@@ -106,7 +106,7 @@ def __init__(
         if init_method == InitMethod.mup:
             std /= math.sqrt(m_width)
         self.c_proj = ParameterizedScatteredExperts(
-            num_experts=config.num_experts,
+            num_experts=self.num_experts,
             in_features=self.intermediate_size,
             out_features=self.hidden_size,
             add_bias=config.add_bias,
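The scatter implementation mirrors the change in moe/base.py: every place that sized the expert weights from config.num_experts now reads self.num_experts, so the per-rank override sketched above applies to ParameterizedScatteredExperts as well.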
4 changes: 3 additions & 1 deletion dolomite_engine/hf_models/models/moe_dolomite_TP/layer.py
@@ -7,7 +7,7 @@
 from ..gpt_dolomite_TP.layer import MLP_TP
 from ..moe_dolomite import MoEDolomiteConfig
 from ..moe_dolomite.layer import SparseMoEBlock
-from .moe_TP.scatter import ScatterMoE_TP
+from .moe_TP import ScatterMoE_TP
 
 
 class SparseMoEBlock_TP(SparseMoEBlock):
@@ -20,6 +20,7 @@ def __init__(
         moe_implementation: str,
         layer_idx: int | None = None,
         sequence_parallel: bool = False,
+        expert_parallel: bool = False,
     ) -> None:
         nn.Module.__init__(self)
 
@@ -59,6 +60,7 @@ def __init__(
             config,
             use_padding_free_transformer=use_padding_free_transformer,
             sequence_parallel=sequence_parallel,
+            expert_parallel=expert_parallel,
             layer_idx=layer_idx,
         )
 
@@ -0,0 +1 @@
+from .scatter import ScatterMoE_TP
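This final one-line file is new; its path is not captured above, but it is presumably the moe_TP package __init__. It re-exports ScatterMoE_TP at package level, which is what allows layer.py to switch from the module-level import (from .moe_TP.scatter import ScatterMoE_TP) to the shorter package-level form (from .moe_TP import ScatterMoE_TP).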