@@ -129,132 +129,9 @@ void read_unity_hparams(unity_hparams* out, std::ifstream &fin) {

};

-
-
-
-
-// Embedding
-std::size_t compute_embed_size(int32_t vocab_size, int32_t dim)
-{
-    return vocab_size * dim * ggml_type_size(GGML_TYPE_F32);
-};
-
-// Attention Layer
-
-struct attention_layer {
-    struct ggml_tensor* layer_norm_w; // model_dim
-    struct ggml_tensor* layer_norm_b; // model_dim
-
-    struct ggml_tensor* q_proj_w; // model_dim x model_dim
-    struct ggml_tensor* q_proj_b; // model_dim
-    struct ggml_tensor* k_proj_w; // model_dim x model_dim
-    struct ggml_tensor* k_proj_b; // model_dim
-    struct ggml_tensor* v_proj_w; // model_dim x model_dim
-    struct ggml_tensor* v_proj_b; // model_dim
-
-    struct ggml_tensor* output_proj_w; // model_dim x model_dim
-    struct ggml_tensor* output_proj_b; // model_dim
-};
-
-std::size_t compute_attention_layer_size(int32_t dim)
-{
-    return LayerNorm_size(dim)
-        + 4 * Linear_size(dim, dim); // q, k, v, and out
-};
-
-void init_attention_layer(
-    attention_layer *layer,
-    fairseq2_model &model_ctx,
-    const std::string &prefix)
-{
-    auto hparams = (unity_hparams&)model_ctx.hparams;
-    const auto dim = hparams.nllb_config__model_dim;
-    auto ctx = model_ctx.ctx;
-    auto &tensor_map = model_ctx.tensors;
-
-    layer->layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + "_layer_norm.weight"] = layer->layer_norm_w;
-    layer->layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + "_layer_norm.bias"] = layer->layer_norm_b;
-
-    layer->q_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".q_proj.weight"] = layer->q_proj_w;
-    layer->q_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".q_proj.bias"] = layer->q_proj_b;
-
-    layer->k_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".k_proj.weight"] = layer->k_proj_w;
-    layer->k_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".k_proj.bias"] = layer->k_proj_b;
-
-    layer->v_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".v_proj.weight"] = layer->v_proj_w;
-    layer->v_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".v_proj.bias"] = layer->v_proj_b;
-
-    layer->output_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".output_proj.weight"] = layer->output_proj_w;
-    layer->output_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".output_proj.bias"] = layer->output_proj_b;
-}
-
-
-// Attention Head
-
-struct attention_head {
-    struct attention_layer* self_attn; // model_dim
-    struct attention_layer* encoder_decoder_attn; // model_dim
-    struct StandardFeedForwardNetwork* ffn;
-};
-
-std::size_t compute_attention_head_size(int32_t dim, int32_t inner_dim)
-{
-    return 2 * compute_attention_layer_size(dim) + StandardFeedForwardNetwork_size(dim, inner_dim);
-};
-
-void init_attention_head(
-    attention_head *head,
-    fairseq2_model &model_ctx,
-    const std::string &prefix)
-{
-    auto hparams = (unity_hparams&)model_ctx.hparams;
-    init_attention_layer(head->self_attn, model_ctx, prefix + ".self_attn");
-    init_attention_layer(head->encoder_decoder_attn, model_ctx, prefix + ".encoder_decoder_attn");
-    StandardFeedForwardNetwork_init((StandardFeedForwardNetwork&)(head->ffn), model_ctx, prefix + ".ffn", hparams.nllb_config__model_dim, hparams.nllb_config__ffn_inner_dim);
-}
-
-// TODO: attention_head_compute_graph
-
-// Text Decoder
-
-struct text_decoder {
-    struct ggml_tensor* frontend_embed_w; // vocab_size x model_dim
-    std::vector<attention_head*> multi_head;
-    struct ggml_tensor* layer_norm_w;
-    struct ggml_tensor* layer_norm_b;
-};
-
-std::size_t compute_context_size(void* raw_hparams)
-{
-    auto hparams = (unity_hparams&)raw_hparams;
-    const auto vocab_size = hparams.nllb_config__vocabulary_size;
-    const auto dim = hparams.nllb_config__model_dim;
-    const auto inner_dim = hparams.nllb_config__ffn_inner_dim;
-    const auto n_layers = hparams.nllb_config__num_decoder_layers;
-
-    const auto overhead = (6 + 12 * n_layers) * 512; // TODO Find out what this is.
-
-    return compute_embed_size(vocab_size, dim)
-        + n_layers * compute_attention_head_size(dim, inner_dim)
-        + LayerNorm_size(dim)
-        + overhead;
-};
-
class unity_model_loader: public model_loader {
public:
    void load_hparams(fairseq2_model& model, std::ifstream &fin);

    std::size_t compute_context_size(void* raw_hparams);
-
-    void tensors_alloc(fairseq2_model &model);
};