@@ -82,6 +82,8 @@ struct unity_model {
     // audio encoder
     struct ggml_tensor * post_extract_proj;
     struct ggml_tensor * audio_enc_pos_conv;
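+    // key/value cache for the audio-encoder self-attention (sized in unity_model_load)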
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
     std::vector<audio_enc_layer> audio_enc_layers;
 
     // text encoder
@@ -166,6 +168,7 @@ bool unity_model_load(const std::string & fname, unity_model & model, gpt_vocab
     const int n_audio_enc_dim = hparams.n_audio_enc_dim;
     const int n_audio_enc_ffn_dim = hparams.n_audio_enc_ffn_dim;
     const int n_audio_enc_layer = hparams.n_audio_enc_layer;
+    const int n_ctx = 1500; // 20 ms per frame * 1500 frames = 30 s of audio
     // const int n_text_vocab = hparams.n_text_vocab;
     const int kernel_size = 31;
 
@@ -192,6 +195,9 @@ bool unity_model_load(const std::string & fname, unity_model & model, gpt_vocab
     ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_w
     ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_b
 
+    ctx_size += n_ctx*n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+    ctx_size += n_ctx*n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // memory_v
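+    // illustrative sizing: with n_audio_enc_layer = 24 and n_audio_enc_dim = 1024
+    // (hypothetical values), each cache is 1500*24*1024 floats, i.e. ~147 MB in F32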
+
     // Adaptor
     // ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // conv_ln
     // ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // conv_pool_1d
@@ -401,7 +407,9 @@ bool unity_model_load(const std::string & fname, unity_model & model, gpt_vocab
         }
 
         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
+        // for (int i = 0; i < 10; ++i) { // debug: print the first values of each tensor
+        //     std::cout << ((float *)(tensor->data))[i] << std::endl;
+        // }
         total_size += ggml_nbytes(tensor);
     }
 
@@ -413,13 +421,105 @@ bool unity_model_load(const std::string & fname, unity_model & model, gpt_vocab
     return true;
 }
 
+// build the computation graph
+struct ggml_cgraph * unity_graph(
+        const unity_model & model,
+        struct ggml_allocr * allocr) {
+
+    const auto & hparams = model.hparams;
+
+    const int n_audio_enc_dim     = hparams.n_audio_enc_dim;
+    const int n_audio_enc_ffn_dim = hparams.n_audio_enc_ffn_dim;
+    const int n_audio_enc_layer   = hparams.n_audio_enc_layer;
+    // const int n_text_vocab = hparams.n_text_vocab;
+    const int kernel_size = 31;
+
+    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // for dev: load an example input captured right before the conformer blocks
+    auto file = std::ifstream("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.bin", std::ios::binary);
+    if (!file) {
+        std::cerr << "Failed to open binary file." << std::endl;
+    }
+    struct ggml_tensor * inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1024, 137); // dev sample: 137 frames x 1024 dims
+    inpL->data = malloc(ggml_nbytes(inpL));
+    file.read(reinterpret_cast<char *>(inpL->data), ggml_nbytes(inpL));
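+    // note: inpL->data is malloc'ed by hand because ctx0 was created with
+    // no_alloc = true; this dev-only buffer is never freed and is reclaimed at exit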
+
+    for (int il = 0; il < n_audio_enc_layer; ++il) {
+        struct ggml_tensor * cur = inpL;
+
+        // ffn1 layer norm
+        cur = ggml_norm(ctx0, cur, hparams.eps);
+        cur = ggml_add(ctx0,
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.audio_enc_layers[il].ffn1_layer_norm_w, cur),
+                    cur),
+                ggml_repeat(ctx0, model.audio_enc_layers[il].ffn1_layer_norm_b, cur));
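+        // ggml_norm only normalizes; the learned scale and shift are applied by hand,
+        // broadcasting the per-channel weight/bias over all frames via ggml_repeat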
+
+        // self_attn (TODO)
+
+        // conv (TODO)
+
+        // ffn2 (TODO)
+
+        // norm (TODO)
+
+        inpL = cur;
+    }
+
+    ggml_build_forward_expand(gf, inpL);
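+    // (build_forward_expand registers inpL and all of its parent ops as graph nodes)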
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+bool unity_eval(
+        const unity_model & model,
+        struct ggml_allocr * allocr,
+        const int n_threads) {
+
+    const auto & hparams = model.hparams;
+
+    // reset the allocator to free all the memory allocated during the previous inference
+    ggml_allocr_reset(allocr);
+
+    struct ggml_cgraph * gf = unity_graph(model, allocr);
+
+    // allocate tensors
+    ggml_allocr_alloc_graph(allocr, gf);
+
+    // run the computation
+    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
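+    // the plan reports the scratch size the compute pass needs; a static work
+    // buffer is reused across calls so it is not reallocated on every inference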
+    static std::vector<uint8_t> work_buffer;
+    work_buffer.resize(plan.work_size);
+    plan.work_data = work_buffer.data();
+    ggml_graph_compute(gf, &plan);
+
+    // in this case, the output tensor is the last one in the graph
+    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
+    for (int i = 0; i < 10; ++i) {
+        printf("%8.4f ", ((float *)(inpL->data))[i]);
+    }
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     // ggml_time_init();
 
     // const int64_t t_main_start_us = ggml_time_us();
 
     gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
 
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
@@ -436,48 +536,41 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    // int64_t t_load_us = 0;
-
     gpt_vocab vocab;
     unity_model model;
 
     // load the model
     {
-        // const int64_t t_start_us = ggml_time_us();
-
         if (!unity_model_load(params.model, model, vocab)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
-
-        // t_load_us = ggml_time_us() - t_start_us;
-
-        // test_gpt_tokenizer(vocab, params.token_test);
     }
 
     // keep this buffer alive while evaluating the model
-    // std::vector<uint8_t> compute_buffer;
-
-    // struct ggml_allocr * allocr = NULL;
-    // // allocate the compute buffer
-    // {
-    //    allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
+    std::vector<uint8_t> compute_buffer;
+    struct ggml_allocr * allocr = NULL;
+    // allocate the compute buffer
+    {
+        allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
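+        // measure pass: building the graph with a measure allocator only records
+        // each tensor's size and offset; no tensor data is allocated yet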
+        struct ggml_cgraph * gf = unity_graph(model, allocr);
+
 
-        // // create the worst case graph for memory usage estimation
-        // int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
-        // int n_past = model.hparams.n_ctx - n_tokens;
-        // struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
+        // compute the required memory
+        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
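+        // GGML_MEM_ALIGN extra bytes leave headroom so the buffer start can be aligned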
 
-        // // compute the required memory
-        // size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
+        // recreate the allocator with the required memory
+        ggml_allocr_free(allocr);
+        compute_buffer.resize(mem_size);
+        allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
 
-        // // recreate the allocator with the required memory
-        // ggml_allocr_free(allocr);
-        // compute_buffer.resize(mem_size);
-        // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
+        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
+    }
 
-    // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
-    // }
+    if (!unity_eval(model, allocr, 1)) {
+        printf("Failed to predict\n");
+        return 1;
+    }
 
     ggml_free(model.ctx);