-
Notifications
You must be signed in to change notification settings - Fork 550
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #34 from chuanzhubin/master
对主要代码添加逐行注释,方便学习者快速理解
- Loading branch information
Showing 4 changed files with 337 additions and 319 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,58 +1,58 @@ | ||
from transformers import PretrainedConfig | ||
from typing import List | ||
|
||
|
||
# 定义 LMConfig 类,继承自 PretrainedConfig | ||
class LMConfig(PretrainedConfig): | ||
model_type = "minimind" | ||
model_type = "minimind" # 设置模型类型为 "minimind" | ||
|
||
def __init__( | ||
self, | ||
dim: int = 512, | ||
n_layers: int = 8, | ||
n_heads: int = 16, | ||
n_kv_heads: int = 8, | ||
vocab_size: int = 6400, | ||
hidden_dim: int = None, | ||
multiple_of: int = 64, | ||
norm_eps: float = 1e-5, | ||
max_seq_len: int = 512, | ||
dropout: float = 0.0, | ||
flash_attn: bool = True, | ||
dim: int = 512, # 模型维度,默认为 512 | ||
n_layers: int = 8, # Transformer 层数,默认为 8 | ||
n_heads: int = 16, # 注意力头数,默认为 16 | ||
n_kv_heads: int = 8, # KV 头数,默认为 8 | ||
vocab_size: int = 6400, # 词汇表大小,默认为 6400 | ||
hidden_dim: int = None, # 隐藏层维度,默认为 None | ||
multiple_of: int = 64, # 隐藏层维度的倍数,默认为 64 | ||
norm_eps: float = 1e-5, # 归一化层的 epsilon 值,默认为 1e-5 | ||
max_seq_len: int = 512, # 最大序列长度,默认为 512 | ||
dropout: float = 0.0, # Dropout 概率,默认为 0.0 | ||
flash_attn: bool = True, # 是否使用 Flash Attention,默认为 True | ||
#################################################### | ||
# Here are the specific configurations of MOE | ||
# When use_moe is false, the following is invalid | ||
# 以下是 MOE(Mixture of Experts)的特定配置 | ||
# 当 use_moe 为 False 时,以下配置无效 | ||
#################################################### | ||
use_moe: bool = False, | ||
num_experts_per_tok=2, | ||
n_routed_experts=4, | ||
n_shared_experts: bool = True, | ||
scoring_func='softmax', | ||
aux_loss_alpha=0.01, | ||
seq_aux=True, | ||
norm_topk_prob=True, | ||
use_moe: bool = False, # 是否使用 MOE,默认为 False | ||
num_experts_per_tok=2, # 每个 token 选择的专家数量,默认为 2 | ||
n_routed_experts=4, # 总的专家数量,默认为 4 | ||
n_shared_experts: bool = True, # 是否使用共享专家,默认为 True | ||
scoring_func='softmax', # 评分函数,默认为 'softmax' | ||
aux_loss_alpha=0.01, # 辅助损失的 alpha 参数,默认为 0.01 | ||
seq_aux=True, # 是否在序列级别上计算辅助损失,默认为 True | ||
norm_topk_prob=True, # 是否标准化 top-k 概率,默认为 True | ||
**kwargs, | ||
): | ||
self.dim = dim | ||
self.n_layers = n_layers | ||
self.n_heads = n_heads | ||
self.n_kv_heads = n_kv_heads | ||
self.vocab_size = vocab_size | ||
self.hidden_dim = hidden_dim | ||
self.multiple_of = multiple_of | ||
self.norm_eps = norm_eps | ||
self.max_seq_len = max_seq_len | ||
self.dropout = dropout | ||
self.flash_attn = flash_attn | ||
self.dim = dim # 设置模型维度 | ||
self.n_layers = n_layers # 设置 Transformer 层数 | ||
self.n_heads = n_heads # 设置注意力头数 | ||
self.n_kv_heads = n_kv_heads # 设置 KV 头数 | ||
self.vocab_size = vocab_size # 设置词汇表大小 | ||
self.hidden_dim = hidden_dim # 设置隐藏层维度 | ||
self.multiple_of = multiple_of # 设置隐藏层维度的倍数 | ||
self.norm_eps = norm_eps # 设置归一化层的 epsilon 值 | ||
self.max_seq_len = max_seq_len # 设置最大序列长度 | ||
self.dropout = dropout # 设置 Dropout 概率 | ||
self.flash_attn = flash_attn # 设置是否使用 Flash Attention | ||
#################################################### | ||
# Here are the specific configurations of MOE | ||
# When use_moe is false, the following is invalid | ||
# 以下是 MOE(Mixture of Experts)的特定配置 | ||
# 当 use_moe 为 False 时,以下配置无效 | ||
#################################################### | ||
self.use_moe = use_moe | ||
self.num_experts_per_tok = num_experts_per_tok # 每个token选择的专家数量 | ||
self.n_routed_experts = n_routed_experts # 总的专家数量 | ||
self.n_shared_experts = n_shared_experts # 共享专家 | ||
self.scoring_func = scoring_func # 评分函数,默认为'softmax' | ||
self.aux_loss_alpha = aux_loss_alpha # 辅助损失的alpha参数 | ||
self.seq_aux = seq_aux # 是否在序列级别上计算辅助损失 | ||
self.norm_topk_prob = norm_topk_prob # 是否标准化top-k概率 | ||
super().__init__(**kwargs) | ||
self.use_moe = use_moe # 设置是否使用 MOE | ||
self.num_experts_per_tok = num_experts_per_tok # 设置每个 token 选择的专家数量 | ||
self.n_routed_experts = n_routed_experts # 设置总的专家数量 | ||
self.n_shared_experts = n_shared_experts # 设置是否使用共享专家 | ||
self.scoring_func = scoring_func # 设置评分函数 | ||
self.aux_loss_alpha = aux_loss_alpha # 设置辅助损失的 alpha 参数 | ||
self.seq_aux = seq_aux # 设置是否在序列级别上计算辅助损失 | ||
self.norm_topk_prob = norm_topk_prob # 设置是否标准化 top-k 概率 | ||
super().__init__(**kwargs) # 调用父类 PretrainedConfig 的初始化方法 |
Oops, something went wrong.