ソースを参照

rm examples/*

Guillaume Wenzek 1 年間 前
コミット
d5d2ec8f12
80 ファイル変更0 行追加33509 行削除
  1. 0 10
      ggml/examples/CMakeLists.txt
  2. 0 13
      ggml/examples/dolly-v2/CMakeLists.txt
  3. 0 187
      ggml/examples/dolly-v2/README.md
  4. 0 116
      ggml/examples/dolly-v2/convert-h5-to-ggml.py
  5. 0 962
      ggml/examples/dolly-v2/main.cpp
  6. 0 178
      ggml/examples/dolly-v2/quantize.cpp
  7. 0 13
      ggml/examples/gpt-2/CMakeLists.txt
  8. 0 158
      ggml/examples/gpt-2/README.md
  9. 0 183
      ggml/examples/gpt-2/convert-cerebras-to-ggml.py
  10. 0 159
      ggml/examples/gpt-2/convert-ckpt-to-ggml.py
  11. 0 195
      ggml/examples/gpt-2/convert-h5-to-ggml.py
  12. 0 69
      ggml/examples/gpt-2/download-ggml-model.sh
  13. 0 48
      ggml/examples/gpt-2/download-model.sh
  14. 0 892
      ggml/examples/gpt-2/main.cpp
  15. 0 184
      ggml/examples/gpt-2/quantize.cpp
  16. 0 13
      ggml/examples/gpt-j/CMakeLists.txt
  17. 0 246
      ggml/examples/gpt-j/README.md
  18. 0 173
      ggml/examples/gpt-j/convert-h5-to-ggml.py
  19. 0 69
      ggml/examples/gpt-j/download-ggml-model.sh
  20. 0 11
      ggml/examples/gpt-j/download-model.sh
  21. 0 748
      ggml/examples/gpt-j/main.cpp
  22. 0 182
      ggml/examples/gpt-j/quantize.cpp
  23. 0 13
      ggml/examples/gpt-neox/CMakeLists.txt
  24. 0 110
      ggml/examples/gpt-neox/README.md
  25. 0 107
      ggml/examples/gpt-neox/convert-h5-to-ggml.py
  26. 0 814
      ggml/examples/gpt-neox/main.cpp
  27. 0 178
      ggml/examples/gpt-neox/quantize.cpp
  28. 0 40
      ggml/examples/mnist/CMakeLists.txt
  29. 0 128
      ggml/examples/mnist/README.md
  30. 0 63
      ggml/examples/mnist/convert-h5-to-ggml.py
  31. 0 169
      ggml/examples/mnist/main-cnn.cpp
  32. 0 122
      ggml/examples/mnist/main-cpu.cpp
  33. 0 125
      ggml/examples/mnist/main-mtl.cpp
  34. 0 26
      ggml/examples/mnist/main-mtl.h
  35. 0 499
      ggml/examples/mnist/main-mtl.m
  36. 0 328
      ggml/examples/mnist/main.cpp
  37. 0 101
      ggml/examples/mnist/mnist-cnn.py
  38. 0 1
      ggml/examples/mnist/models/mnist/.gitignore
  39. BIN
      ggml/examples/mnist/models/mnist/mnist_model.state_dict
  40. BIN
      ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte
  41. 0 13
      ggml/examples/mpt/CMakeLists.txt
  42. 0 27
      ggml/examples/mpt/README.md
  43. 0 169
      ggml/examples/mpt/convert-h5-to-ggml.py
  44. 0 1039
      ggml/examples/mpt/main.cpp
  45. 0 186
      ggml/examples/mpt/quantize.cpp
  46. 0 100
      ggml/examples/prompts/dolly-v2.txt
  47. 0 1
      ggml/examples/prompts/gpt-2-chinese.txt
  48. 0 100
      ggml/examples/prompts/gpt-2.txt
  49. 0 100
      ggml/examples/prompts/gpt-j.txt
  50. 0 1
      ggml/examples/prompts/gpt-neox-japanese.txt
  51. 0 100
      ggml/examples/prompts/gpt-neox.txt
  52. 0 3
      ggml/examples/prompts/polyglot-ko.txt
  53. 0 100
      ggml/examples/prompts/replit.txt
  54. 0 100
      ggml/examples/prompts/starcoder.txt
  55. 0 110
      ggml/examples/prompts/test-cases.txt
  56. 0 65
      ggml/examples/prompts/tokenize_huggingface.py
  57. 0 100
      ggml/examples/prompts/whisper.txt
  58. 0 13
      ggml/examples/replit/CMakeLists.txt
  59. 0 117
      ggml/examples/replit/convert-h5-to-ggml.py
  60. 0 795
      ggml/examples/replit/main.cpp
  61. 0 182
      ggml/examples/replit/quantize.cpp
  62. 0 13
      ggml/examples/sam/CMakeLists.txt
  63. 0 105
      ggml/examples/sam/README.md
  64. 0 147
      ggml/examples/sam/convert-pth-to-ggml.py
  65. 0 2201
      ggml/examples/sam/main.cpp
  66. 0 31
      ggml/examples/starcoder/CMakeLists.txt
  67. 0 115
      ggml/examples/starcoder/README.md
  68. 0 208
      ggml/examples/starcoder/convert-hf-to-ggml.py
  69. 0 927
      ggml/examples/starcoder/main.cpp
  70. 0 184
      ggml/examples/starcoder/quantize.cpp
  71. 0 1127
      ggml/examples/starcoder/starcoder-mmap.cpp
  72. 0 7987
      ggml/examples/stb_image.h
  73. 0 1724
      ggml/examples/stb_image_write.h
  74. 0 22
      ggml/examples/whisper/CMakeLists.txt
  75. 0 29
      ggml/examples/whisper/README.md
  76. 0 342
      ggml/examples/whisper/convert-pt-to-ggml.py
  77. 0 1024
      ggml/examples/whisper/main.cpp
  78. 0 223
      ggml/examples/whisper/quantize.cpp
  79. 0 5515
      ggml/examples/whisper/whisper.cpp
  80. 0 531
      ggml/examples/whisper/whisper.h

+ 0 - 10
ggml/examples/CMakeLists.txt

@@ -18,14 +18,4 @@ add_library(common-ggml STATIC common-ggml.cpp)
 target_link_libraries(common-ggml PRIVATE ggml)
 target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
-add_subdirectory(gpt-2)
-add_subdirectory(gpt-j)
-add_subdirectory(whisper)
-add_subdirectory(mnist)
-add_subdirectory(gpt-neox)
-add_subdirectory(dolly-v2)
-add_subdirectory(replit)
-add_subdirectory(mpt)
-add_subdirectory(starcoder)
-add_subdirectory(sam)
 add_subdirectory(unity)

+ 0 - 13
ggml/examples/dolly-v2/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# dollyv2
-
-set(TEST_TARGET dollyv2)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# dollyv2-quantize
-
-set(TEST_TARGET dollyv2-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 187
ggml/examples/dolly-v2/README.md

@@ -1,187 +0,0 @@
-# Dolly-V2
-
-Transformer architecture: GPT-NeoX
-
-Modeled from examples/stablelm
-
-Ref: https://github.com/databrickslabs/dolly
-
-Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha
-
-## Usage
-
-```bash
-# get the repo and build it
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j
-
-# get the Dolly-V2 3B model
-git clone https://huggingface.co/databricks/dolly-v2-3b
-
-# install Python dependencies
-python3 -m pip install -r ../requirements.txt
-
-# convert model to FP16
-python3 ../examples/dolly-v2/convert-h5-to-ggml.py ./dolly-v2-3b/ 1
-
-# run inference using FP16 precision
-./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-f16.bin -p "State the meaning of life." -t 6 -n 64
-
-main: seed = 1683218142
-dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-f16.bin' - please wait ...
-dollyv2_model_load: n_vocab = 50280
-dollyv2_model_load: n_ctx   = 2048
-dollyv2_model_load: n_embd  = 2560
-dollyv2_model_load: n_head  = 32
-dollyv2_model_load: n_layer = 32
-dollyv2_model_load: n_rot   = 20
-dollyv2_model_load: ftype   = 1
-dollyv2_model_load: ggml ctx size = 7374.91 MB
-dollyv2_model_load: memory_size =   640.00 MB, n_mem = 65536
-dollyv2_model_load: ................................................ done
-dollyv2_model_load: model size =  5295.10 MB / num tensors = 388
-main: number of tokens in prompt = 32
-main: token[0] =  30003, Below
-main: token[1] =    310,  is
-main: token[2] =    271,  an
-main: token[3] =   9775,  instruction
-main: token[4] =    326,  that
-main: token[5] =   8631,  describes
-main: token[6] =    247,  a
-main: token[7] =   4836,  task
-main: token[8] =    964, .
-main: token[9] =  19566,  Write
-main: token[10] =    247,  a
-main: token[11] =   2380,  response
-main: token[12] =    326,  that
-main: token[13] =  20420,  appropriately
-main: token[14] =  29141,  completes
-main: token[15] =    253,  the
-main: token[16] =   2748,  request
-main: token[17] =    964, .
-main: token[18] =    187, 
-
-main: token[19] =    187, 
-
-main: token[20] =  50278, ### Instruction:
-main: token[21] =    187, 
-
-main: token[22] =   5443, State
-main: token[23] =    253,  the
-main: token[24] =   4495,  meaning
-main: token[25] =    273,  of
-main: token[26] =   1495,  life
-main: token[27] =    964, .
-main: token[28] =    187, 
-
-main: token[29] =    187, 
-
-main: token[30] =  50279, ### Response:
-main: token[31] =    187, 
-
-
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-State the meaning of life.
-
-### Response:
-The meaning of life is to love and be loved.
-
-### End
-
-main: mem per token = 16136720 bytes
-main:     load time =  2202.58 ms
-main:   sample time =     2.57 ms
-main:  predict time =  1497.14 ms / 33.27 ms per token
-main:    total time =  6187.27 ms
-```
-
-## 5-bit integer quantization mode
-
-```bash
-# quantize the model to 5-bits using Q5_0 quantization
-./bin/dollyv2-quantize ./dolly-v2-3b/ggml-model-f16.bin ./dolly-v2-3b/ggml-model-q5_0.bin q5_0
-
-# run the quantized model
-./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-q5_0.bin -p "State the meaning of life." -t 6 -n 64
-
-main: seed = 1683218518
-dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-q5_0.bin' - please wait ...
-dollyv2_model_load: n_vocab = 50280
-dollyv2_model_load: n_ctx   = 2048
-dollyv2_model_load: n_embd  = 2560
-dollyv2_model_load: n_head  = 32
-dollyv2_model_load: n_layer = 32
-dollyv2_model_load: n_rot   = 20
-dollyv2_model_load: ftype   = 8
-dollyv2_model_load: ggml ctx size = 3902.68 MB
-dollyv2_model_load: memory_size =   640.00 MB, n_mem = 65536
-dollyv2_model_load: ................................................ done
-dollyv2_model_load: model size =  1822.87 MB / num tensors = 388
-main: number of tokens in prompt = 32
-main: token[0] =  30003, Below
-main: token[1] =    310,  is
-main: token[2] =    271,  an
-main: token[3] =   9775,  instruction
-main: token[4] =    326,  that
-main: token[5] =   8631,  describes
-main: token[6] =    247,  a
-main: token[7] =   4836,  task
-main: token[8] =    964, .
-main: token[9] =  19566,  Write
-main: token[10] =    247,  a
-main: token[11] =   2380,  response
-main: token[12] =    326,  that
-main: token[13] =  20420,  appropriately
-main: token[14] =  29141,  completes
-main: token[15] =    253,  the
-main: token[16] =   2748,  request
-main: token[17] =    964, .
-main: token[18] =    187, 
-
-main: token[19] =    187, 
-
-main: token[20] =  50278, ### Instruction:
-main: token[21] =    187, 
-
-main: token[22] =   5443, State
-main: token[23] =    253,  the
-main: token[24] =   4495,  meaning
-main: token[25] =    273,  of
-main: token[26] =   1495,  life
-main: token[27] =    964, .
-main: token[28] =    187, 
-
-main: token[29] =    187, 
-
-main: token[30] =  50279, ### Response:
-main: token[31] =    187, 
-
-
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-State the meaning of life.
-
-### Response:
-The meaning of life is the discovery of the true self.
-
-### End
-
-main: mem per token = 16127760 bytes
-main:     load time =  1011.09 ms
-main:   sample time =     2.79 ms
-main:  predict time =  1271.62 ms / 27.64 ms per token
-main:    total time =  2802.51 ms
-```
-
-## Notes
-
-- No guarantees for correctness
-- The tokenizer is currently hacked - probably works only for English
-- Non-parallel residual is not supported
-- Contributions and improvements are welcome

+ 0 - 116
ggml/examples/dolly-v2/convert-h5-to-ggml.py

@@ -1,116 +0,0 @@
-import sys
-import struct
-import json
-import numpy as np
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-#print(tokenizer.encode('I believe the meaning of life is'))
-
-list_vars = model.state_dict()
-for name in list_vars.keys():
-    print(name, list_vars[name].shape, list_vars[name].dtype)
-
-fout = open(fname_out, "wb")
-
-print(hparams)
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["max_position_embeddings"]))
-fout.write(struct.pack("i", hparams["hidden_size"]))
-fout.write(struct.pack("i", hparams["num_attention_heads"]))
-fout.write(struct.pack("i", hparams["num_hidden_layers"]))
-fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
-fout.write(struct.pack("i", hparams["use_parallel_residual"]))
-fout.write(struct.pack("i", ftype))
-
-# TODO: temporary hack to not deal with implementing the tokenizer
-dot_token = tokenizer.encode('.')[0]
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([dot_token, i]).encode('utf-8')
-    # remove the first byte (it's always '.')
-    text = text[1:]
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith(".attention.masked_bias") or     \
-       name.endswith(".attention.bias") or \
-       name.endswith(".attention.rotary_emb.inv_freq"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0;
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 962
ggml/examples/dolly-v2/main.cpp

@@ -1,962 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cinttypes>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if !defined(_WIN32)
-#define DOLLY_INTERACTIVE_PORT
-#endif
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <unistd.h>
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (Dolly-V2 3B)
-struct dollyv2_hparams {
-    int32_t n_vocab = 50254; // tokenizer.vocab_size
-    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
-    int32_t n_embd  = 2560;  // model.config.hidden_size
-    int32_t n_head  = 32;    // model.config.num_attention_heads
-    int32_t n_layer = 32;    // model.config.num_hidden_layers
-    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = GGML_FTYPE_MOSTLY_F16;
-    float   eps     = 1e-5f;
-};
-
-const std::string INSTRUCTION_KEY = "### Instruction:";
-const std::string RESPONSE_KEY    = "### Response:";
-const std::string END_KEY         = "### End";
-const std::string INTRO_BLURB     = "Below is an instruction that describes a task. Write a response that appropriately completes the request.";
-
-// dollyv2 prompt format
-std::string prompt_for_generation(const std::string& instruction) {
-    return INTRO_BLURB + "\n\n" + INSTRUCTION_KEY + "\n" + instruction + "\n\n" + RESPONSE_KEY + "\n";
-}
-
-struct dollyv2_layer {
-    // pre normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // post normalization
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct dollyv2_model {
-    dollyv2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    //struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<dollyv2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: par_res = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-
-        vocab.add_special_token("### End");
-        vocab.add_special_token("### Instruction:");
-        vocab.add_special_token("### Response:");
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype);           // lmh_g
-        //ctx_size +=        n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
-
-        ctx_size += (6 + 16*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        //model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
-
-        // map by name
-        model.tensors["gpt_neox.embed_in.weight"] = model.wte;
-
-        model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
-        model.tensors["gpt_neox.final_layer_norm.bias"]   = model.ln_f_b;
-
-        model.tensors["embed_out.weight"] = model.lmh_g;
-        //model.tensors["lm_head.bias"]   = model.lmh_b;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_proj_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);
-            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-
-            // unmapped: attention.rotary_emb, mlp.act
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"]   = layer.ln_1_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"]   = layer.c_attn_attn_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"]   = layer.c_attn_proj_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"]   = layer.ln_2_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"]   = layer.c_mlp_fc_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"]   = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int64_t n_mem      = n_layer*n_ctx;
-        const int64_t n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// feed-forward network
-ggml_tensor * gpt_neox_ff(
-        const dollyv2_layer & layer,
-        ggml_context        * ctx0,
-        ggml_tensor         * inp,
-        float                 eps) {
-    ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
-
-    cur = ggml_add(ctx0,
-        ggml_mul(ctx0,
-            ggml_repeat(ctx0, layer.ln_2_g, cur),
-            cur),
-        ggml_repeat(ctx0, layer.ln_2_b, cur));
-
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_fc_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
-            cur);
-
-    // GELU activation
-    cur = ggml_gelu(ctx0, cur);
-
-    // projection
-    // cur = proj_w*cur + proj_b
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_proj_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
-            cur);
-    return cur;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool dollyv2_eval(
-        const dollyv2_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = { };
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    // wte
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // self-attention
-        {
-            {
-                cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-            }
-
-            // compute QKV
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_attn_w,
-                        cur);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                        cur);
-            }
-
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
-
-            // using mode = 2 for GPT-NeoX mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0);
-
-            // store key and value to memory
-            {
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
-                        (   n_ctx)*ggml_element_size(model.memory_v),
-                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, model.memory_v,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(model.memory_v),
-                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_proj_w,
-                        cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
-            }
-        }
-
-        if (hparams.par_res == 0) {
-            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
-
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpFF);
-        } else {
-            struct ggml_tensor * inpFF = cur;
-
-            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
-            // note here we pass inpL instead of cur
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);
-
-            // layer input + FF
-            cur  = ggml_add(ctx0, cur, inpFF);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpL);
-        }
-
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // lm_head
-    {
-        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-        //inpL = ggml_add(ctx0,
-        //        ggml_repeat(ctx0, model.lmh_b, inpL),
-        //        inpL);
-    }
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-std::string execute_prompt(
-        const dollyv2_model &model,
-        gpt_vocab &vocab,
-        const std::string &prompt,
-        gpt_params &params,
-        std::mt19937 &rng,
-        int64_t t_load_us,
-        int64_t t_sample_us,
-        int64_t t_predict_us,
-        size_t mem_per_token,
-        int n_past,
-        bool stream_response_to_cout = false) {
-    std::string output = "";
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int)embd_inp.size());
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-
-    dollyv2_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token);
-
-    const int32_t end_token = vocab.token_to_id["### End"];
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!dollyv2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return output;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            output += vocab.id_to_token[id];
-            if (stream_response_to_cout) {
-                printf("%s", vocab.id_to_token[id].c_str());
-            }
-        }
-        if (stream_response_to_cout) {
-            fflush(stdout);
-        }
-
-        // end of text token
-        if (embd.back() == 0 || (end_token > 0 && embd.back() == end_token)) {
-            return output;
-        }
-    }
-    return output;
-}
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-int setup_port(const int port) {
-    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-    if (sockfd < 0) {
-        fprintf(stderr, "%s: Failed to create new socket\n", __func__);
-        return -1;
-    }
-
-    sockaddr_in servaddr;
-    std::memset(&servaddr, 0, sizeof(servaddr));
-
-    servaddr.sin_family = AF_INET;
-    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-    servaddr.sin_port = htons(port);
-
-    if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
-        fprintf(stderr, "%s: Failed to bind to port %i\n", __func__, port);
-        return -1;
-    }
-
-    if (listen(sockfd, 10) < 0) {
-        fprintf(stderr, "%s: Failed to listen to socket on port %i\n", __func__, port);
-        return -1;
-    }
-    return sockfd;
-}
-
-std::string read_from_port(int sockfd, int clientfd) {
-    if (clientfd < 0) {
-        fprintf(stderr, "%s: Failed to accept new connection\n", __func__);
-        return "";
-    }
-
-    char buffer[4096];
-    std::memset(buffer, 0, sizeof(buffer));
-
-    if (read(clientfd, buffer, sizeof(buffer)) < 0) {
-        fprintf(stderr, "%s: Failed to read from client\n", __func__);
-    } else {
-        std::cout << "Received: " << buffer;
-        return std::string(buffer);
-    }
-    return std::string("");
-}
-#endif
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/dolly-v2-3b/ggml-model-f16.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    int64_t t_load_us = 0;
-    int64_t t_sample_us = 0;
-    int64_t t_predict_us = 0;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-
-    int n_past = 0;
-
-    gpt_vocab vocab;
-    dollyv2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!dollyv2_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-    int sockfd = -1;
-    if (params.interactive_port != -1) {
-        sockfd = setup_port(params.interactive_port);
-        if (sockfd == -1) {
-            return 1;
-        }
-        fprintf(stdout, "Model is ready on port %i\n", params.interactive_port);
-        fflush(stdout);
-    }
-#endif
-
-    if (params.interactive || params.interactive_port != -1) {
-        while (true) {
-            std::string prompt_input;
-#if defined(DOLLY_INTERACTIVE_PORT)
-            int clientfd = -1;
-            if (params.interactive_port != -1) {
-                sockaddr_in clientaddr;
-                socklen_t clientaddrlen = sizeof(clientaddr);
-                clientfd = accept(sockfd, (struct sockaddr *)&clientaddr, &clientaddrlen);
-                prompt_input = read_from_port(sockfd, clientfd);
-            } else
-#endif
-            {
-                printf("Please enter your quesiton:\n>");
-                fflush(stdout);
-
-                std::getline(std::cin, prompt_input);
-            }
-
-            if (strcmp(prompt_input.c_str(), "exit") == 0) {
-                break;
-            }
-
-            const std::string prompt = prompt_for_generation(prompt_input);
-            // call the model
-            const std::string response = execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-            if (params.interactive_port != -1) {
-                if (write(clientfd, response.c_str(), response.size()) < 0) {
-                    fprintf(stderr, "%s: Failed to write answer '%s' to client\n", __func__, response.c_str());
-                }
-
-                if (close(clientfd) < 0) {
-                    fprintf(stderr, "%s: Failed to close client socket\n", __func__);
-                }
-            } else
-#endif
-            {
-                printf("%s\n\n", response.c_str());
-            }
-            fflush(stdout);
-        }
-    } else {
-        if (params.prompt.empty()) {
-            params.prompt = gpt_random_prompt(rng);
-        }
-
-        const std::string prompt = prompt_for_generation(params.prompt);
-        execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-    if (params.interactive_port != -1 && close(sockfd) < 0) {
-        fprintf(stderr, "%s: Failed to close server socket\n", __func__);
-    }
-#endif
-
-    return 0;
-}

+ 0 - 178
ggml/examples/dolly-v2/quantize.cpp

@@ -1,178 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (dollyv2 3B)
-struct dollyv2_hparams {
-    int32_t n_vocab = 50254; // tokenizer.vocab_size
-    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
-    int32_t n_embd  = 2560;  // model.config.hidden_size
-    int32_t n_head  = 32;    // model.config.num_attention_heads
-    int32_t n_layer = 32;    // model.config.num_hidden_layers
-    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = GGML_FTYPE_MOSTLY_F16;
-};
-
-// quantize a model
-bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    dollyv2_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: par_res     = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 13
ggml/examples/gpt-2/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# gpt-2
-
-set(TEST_TARGET gpt-2)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-2-quantize
-
-set(TEST_TARGET gpt-2-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 158
ggml/examples/gpt-2/README.md

@@ -1,158 +0,0 @@
-# gpt-2
-
-This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library.
-
-The program runs on the CPU - no video card is required.
-
-The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported.
-
-The example supports the following GPT-2 models:
-
-| Model | Description  | Disk Size |
-| ---   | ---          | ---       |
-| 117M  | Small model  | 240 MB    |
-| 345M  | Medium model | 680 MB    |
-| 774M  | Large model  | 1.5 GB    |
-| 1558M | XL model     | 3.0 GB    |
-
-Sample performance on MacBook M1 Pro:
-
-| Model | Size  | Time / Token |
-| ---   | ---   | ---    |
-| GPT-2 |  117M |   5 ms |
-| GPT-2 |  345M |  12 ms |
-| GPT-2 |  774M |  23 ms |
-| GPT-2 | 1558M |  42 ms |
-
-*TODO: add tables for Cerebras-GPT models*
-
-Sample output:
-
-```
-$ ./bin/gpt-2 -h
-usage: ./bin/gpt-2 [options]
-
-options:
-  -h, --help            show this help message and exit
-  -s SEED, --seed SEED  RNG seed (default: -1)
-  -t N, --threads N     number of threads to use during computation (default: 8)
-  -p PROMPT, --prompt PROMPT
-                        prompt to start generation with (default: random)
-  -n N, --n_predict N   number of tokens to predict (default: 200)
-  --top_k N             top-k sampling (default: 40)
-  --top_p N             top-p sampling (default: 0.9)
-  --temp N              temperature (default: 1.0)
-  -b N, --batch_size N  batch size for prompt processing (default: 8)
-  -m FNAME, --model FNAME
-                        model path (default: models/gpt-2-117M/ggml-model.bin)
-
-$ ./bin/gpt-2
-gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
-gpt2_model_load: n_vocab = 50257
-gpt2_model_load: n_ctx   = 1024
-gpt2_model_load: n_embd  = 768
-gpt2_model_load: n_head  = 12
-gpt2_model_load: n_layer = 12
-gpt2_model_load: f16     = 1
-gpt2_model_load: ggml ctx size = 311.12 MB
-gpt2_model_load: memory size =    72.00 MB, n_mem = 12288
-gpt2_model_load: model size  =   239.08 MB
-main: number of tokens in prompt = 1
-
-So this is going to be the end of the line for us.
-
-If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan.
-
-Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs.
-
-We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|>
-
-main: mem per token =  2048612 bytes
-main:     load time =   106.32 ms
-main:   sample time =     7.10 ms
-main:  predict time =   506.40 ms / 5.06 ms per token
-main:    total time =   629.84 ms
-```
-
-## Downloading and converting the original models (GPT-2)
-
-You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are
-in Tensorflow format, so in order to use them with ggml, you need to convert them to appropriate format. This is done
-via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script.
-
-Here is the entire process for the GPT-2 117M model (download from official site + conversion):
-
-```
-cd ggml/build
-../examples/gpt-2/download-model.sh 117M
-
-Downloading model 117M ...
-models/gpt-2-117M/checkpoint                      100%[=============================>]      77  --.-KB/s    in 0s
-models/gpt-2-117M/encoder.json                    100%[=============================>]   1018K  1.20MB/s    in 0.8s
-models/gpt-2-117M/hparams.json                    100%[=============================>]      90  --.-KB/s    in 0s
-models/gpt-2-117M/model.ckpt.data-00000-of-00001  100%[=============================>] 474.70M  1.21MB/s    in 8m 39s
-models/gpt-2-117M/model.ckpt.index                100%[=============================>]   5.09K  --.-KB/s    in 0s
-models/gpt-2-117M/model.ckpt.meta                 100%[=============================>] 460.11K   806KB/s    in 0.6s
-models/gpt-2-117M/vocab.bpe                       100%[=============================>] 445.62K   799KB/s    in 0.6s
-Done! Model '117M' saved in 'models/gpt-2-117M/'
-
-Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.
-
-  python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1
-
-```
-
-This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid
-this, you can download the already converted ggml models as described below.
-
-## Downloading and converting the original models (Cerebras-GPT)
-
-Clone the respective repository from here: https://huggingface.co/cerebras
-
-Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format:
-
-```
-cd ggml/build
-git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/
-python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/
-
-```
-
-## Downloading the ggml model directly (GPT-2)
-
-For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This
-way, you can directly download a single binary file and start using it. No python or Tensorflow is required.
-
-Here is how to get the 117M ggml model:
-
-```
-cd ggml/build
-../examples/gpt-2/download-ggml-model.sh 117M
-
-Downloading ggml model 117M ...
-models/gpt-2-117M/ggml-model.bin         100%[===============================>] 239.58M  8.52MB/s    in 28s
-Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin'
-You can now use it like this:
-
-  $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
-
-```
-
-At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above.
-
-## Quantizing the models
-
-You can also try to quantize the `ggml` models via 4-bit integer quantization.
-Keep in mind that for smaller models, this will render them completely useless.
-You generally want to quantize larger models.
-
-```
-# quantize GPT-2 F16 to Q4_0 (faster but less precise)
-./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2
-./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example"
-
-# quantize Cerebras F16 to Q4_1 (slower but more precise)
-./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3
-./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example"
-
-```

+ 0 - 183
ggml/examples/gpt-2/convert-cerebras-to-ggml.py

@@ -1,183 +0,0 @@
-# Convert Cerebras models to ggml format
-#
-# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/
-#
-
-import sys
-import struct
-import json
-import torch
-import numpy as np
-import re
-
-from transformers import AutoModelForCausalLM
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 2:
-    print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model-f16.bin"
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 2:
-    use_f16 = False
-    fname_out = sys.argv[1] + "/ggml-model-f32.bin"
-
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-list_vars = model.state_dict()
-#print (list_vars)
-
-print(hparams)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", use_f16))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # rename headers to keep compatibility
-    if name == "transformer.ln_f.weight":
-        name = "model/ln_f/g"
-    elif name == "transformer.ln_f.bias":
-        name = "model/ln_f/b"
-    elif name == "transformer.wte.weight":
-        name = "model/wte"
-    elif name == "transformer.wpe.weight":
-        name = "model/wpe"
-    elif name == "lm_head.weight":
-        name = "model/lm_head"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/g"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/w"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/w"
-    elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/b"
-    elif re.match(r"transformer.h.\d+.ln_2.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/g"
-    elif re.match(r"transformer.h.\d+.ln_2.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/b"
-    else:
-        print("Unrecognized variable name. %s", name)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
-        if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-
-    # for efficiency - transpose the projection matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    if name[-14:] == "/attn/c_attn/w" or \
-       name[-14:] == "/attn/c_proj/w" or \
-       name[-11:] == "/mlp/c_fc/w" or \
-       name[-13:] == "/mlp/c_proj/w":
-        print("  Transposing")
-        data = data.transpose()
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 159
ggml/examples/gpt-2/convert-ckpt-to-ggml.py

@@ -1,159 +0,0 @@
-# Convert a model checkpoint to a ggml compatible file
-#
-# Load the model using TensorFlow.
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
-
-import sys
-import json
-import struct
-import numpy as np
-import tensorflow as tf
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-# helper method to convert a numpy array to different float types
-def convert_to_ftype(data, ftype):
-    # fp16
-    if ftype == 1:
-        return data.astype(np.float16)
-
-    assert False, "Invalid ftype: " + str(ftype)
-
-if len(sys.argv) < 3:
-    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-list_vars = tf.train.list_variables(dir_model)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_vocab"]))
-fout.write(struct.pack("i", hparams["n_ctx"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", ftype))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name, shape in list_vars:
-    print("Processing variable: " + name + " with shape: ", shape)
-
-    data = tf.train.load_variable(dir_model, name).squeeze()
-    n_dims = len(data.shape);
-
-    # for efficiency - transpose the projection matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    if name[-14:] == "/attn/c_attn/w" or \
-       name[-14:] == "/attn/c_proj/w" or \
-       name[-11:] == "/mlp/c_fc/w" or \
-       name[-13:] == "/mlp/c_proj/w":
-        print("  Transposing")
-        data = data.transpose()
-
-    dshape = data.shape
-
-    ftype_cur = 0
-    if ftype != 0:
-        # match name:
-        #  "model/wte"
-        #  "model/h.*/attn/c_attn/w"
-        #  "model/h.*/attn/c_proj/w"
-        #  "model/h.*/mlp/c_fc/w"
-        #  "model/h.*/mlp/c_proj/w"
-        if name == "model/wte" or name[-2:] == "/w":
-            print("  Converting to " + ftype_str[ftype])
-            data = convert_to_ftype(data, ftype)
-            ftype_cur = ftype
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 195
ggml/examples/gpt-2/convert-h5-to-ggml.py

@@ -1,195 +0,0 @@
-# Convert GPT-2 h5 transformer model to ggml format
-#
-# Load the model using GPT2Model.
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
-
-import sys
-import struct
-import json
-import numpy as np
-import re
-
-from transformers import GPT2Model
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 2:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
-    encoder_added = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 2:
-    use_f16 = False
-    fname_out = sys.argv[1] + "/ggml-model-f32.bin"
-
-model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-list_vars = model.state_dict()
-#print (list_vars)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-#fout.write(struct.pack("i", hparams["rotary_dim"]))
-fout.write(struct.pack("i", use_f16))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder) + len(encoder_added)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for key in encoder_added:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-
-    # for efficiency - transpose these matrices:
-    #  "transformer.h.*.mlp.c_proj.weight
-    if name.endswith(".mlp.c_proj.weight"):
-        print("  Transposing")
-        data = data.transpose()
-
-    # rename headers to keep compatibility
-    if name == "ln_f.weight":
-        name = "model/ln_f/g"
-    elif name == "ln_f.bias":
-        name = "model/ln_f/b"
-    elif name == "wte.weight":
-        name = "model/wte"
-    elif name == "wpe.weight":
-        name = "model/wpe"
-    elif re.match(r"h\.\d+\.ln_1\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/g"
-    elif re.match(r"h\.\d+\.ln_1\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/b"
-    elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/w"
-    elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/b"
-    elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/w"
-    elif re.match(r"h.\d+.attn.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/b"
-    elif re.match(r"h.\d+.ln_2.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/g"
-    elif re.match(r"h.\d+.ln_2.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/b"
-    elif re.match(r"h.\d+.mlp.c_fc.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/w"
-    elif re.match(r"h.\d+.mlp.c_fc.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/b"
-    elif re.match(r"h.\d+.mlp.c_proj.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/w"
-    elif re.match(r"h.\d+.mlp.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/b"
-    else:
-        print("Unrecognized variable name. %s", name)
-
-    str = name.encode('utf-8')
-
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 69
ggml/examples/gpt-2/download-ggml-model.sh

@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# This script downloads GPT-2 model files that have already been converted to ggml format.
-# This way you don't have to convert them yourself.
-#
-# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead.
-
-#src="https://ggml.ggerganov.com"
-#pfx="ggml-model-gpt-2"
-
-src="https://huggingface.co/ggerganov/ggml"
-pfx="resolve/main/ggml-model-gpt-2"
-
-ggml_path=$(dirname $(realpath $0))
-
-# GPT-2 models
-models=( "117M" "345M" "774M" "1558M" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download ggml model
-
-printf "Downloading ggml model $model ...\n"
-
-mkdir -p models/gpt-2-$model
-
-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin
-elif [ -x "$(command -v curl)" ]; then
-    curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
-
-if [ $? -ne 0 ]; then
-    printf "Failed to download ggml model $model \n"
-    printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n"
-    exit 1
-fi
-
-printf "Done! Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n"
-printf "You can now use it like this:\n\n"
-printf "  $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n"
-printf "\n"

+ 0 - 48
ggml/examples/gpt-2/download-model.sh

@@ -1,48 +0,0 @@
-#!/bin/bash
-
-ggml_path=$(dirname $(realpath $0))
-
-# GPT-2 models
-models=( "117M" "345M" "774M" "1558M" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download model
-
-printf "Downloading model $model ...\n"
-
-mkdir -p models/gpt-2-$model
-
-for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do
-    wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file
-done
-
-printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n"
-printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n"
-printf "\n"
-printf "  python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n"
-printf "\n"

+ 0 - 892
ggml/examples/gpt-2/main.cpp

@@ -1,892 +0,0 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
-
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*512; // object overhead
-
-        printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// build the computation graph
-struct ggml_cgraph * gpt2_graph(
-        const gpt2_model & model,
-        struct ggml_allocr * allocr,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-
-    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
-    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, embd);
-
-    // avoid writing to tensors if we are only measuring the memory usage
-    if (!ggml_allocr_is_measure(allocr)) {
-        memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-    }
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, position);
-    if (!ggml_allocr_is_measure(allocr)) {
-        for (int i = 0; i < N; ++i) {
-            ((int32_t *) position->data)[i] = n_past + i;
-        }
-    }
-
-    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(allocr, KQ_scale);
-    if (!ggml_allocr_is_measure(allocr)) {
-        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        KQ_scale);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    ggml_build_forward_expand(gf, inpL);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - allocr:    ggml_allocr to use to allocate the compute buffer
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        struct ggml_allocr * allocr,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_vocab = hparams.n_vocab;
-
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-
-    // run the computation
-    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> work_buffer;
-    work_buffer.resize(plan.work_size);
-    plan.work_data = work_buffer.data();
-    ggml_graph_compute(gf, &plan);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    // in this case, the output tensor is the last one in the graph
-    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    // keep this buffer alive while evaluating the model
-    std::vector<uint8_t> compute_buffer;
-
-    struct ggml_allocr * allocr = NULL;
-    // allocate the compute buffer
-    {
-        allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
-
-        // create the worst case graph for memory usage estimation
-        int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
-        int n_past = model.hparams.n_ctx - n_tokens;
-        struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
-        // compute the required memory
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
-
-        // recreate the allocator with the required memory
-        ggml_allocr_free(allocr);
-        compute_buffer.resize(mem_size);
-        allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
-
-        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-        printf("%d ", embd_inp[i]);
-    }
-    printf("\n\n");
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 184
ggml/examples/gpt-2/quantize.cpp

@@ -1,184 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    gpt2_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        "model/wte",
-        "model/lm_head",
-        "model/h.*/attn/c_attn/w",
-        "model/h.*/attn/c_proj/w",
-        "model/h.*/mlp/c_fc/w",
-        "model/h.*/mlp/c_proj/w",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 13
ggml/examples/gpt-j/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# gpt-j
-
-set(TEST_TARGET gpt-j)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-j-quantize
-
-set(TEST_TARGET gpt-j-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 246
ggml/examples/gpt-j/README.md

@@ -1,246 +0,0 @@
-# gpt-j
-
-Local GPT-J inference on your computer using C/C++
-
-No video card required. You just need to have 16 GB of RAM.
-
-## Motivation
-
-The GPT-J 6B model is the open-source alternative to OpenAI's GPT-3. It's basically a neural network that allows you to
-generate coherent, human-like text given a certain context (prompt).
-
-The GPT-J model is quite big - the compact version of the model uses 16-bit floating point representation of the weights
-and is still 12 GB big. This means that in order to run inference on your computer, you would need to have a video card
-with at least 12 GB of video RAM. Alternatively, you can try to run the python implementations on the CPU, but that
-would probably not be very efficient as they are primarily optimized for running on a GPU (or at least this is my guess -
-I don't have much experience with python).
-
-I wanted to try and run the model on my MacBook, so I decided to implement the model inference from scratch using my own
-custom build tensor library. The tensor library (called [ggml](https://github.com/ggerganov/ggml), written in C) is in
-early development stage, but it already allows me to run the GPT-J model.
-
-On my 32GB MacBook M1 Pro, I achieve an inference speed of about `125 ms/token` or about ~6 words per second (1 word
-typically consists of 1 or 2 tokens).
-
-Here is a sample run with prompt `int main(int argc, char ** argv) {`:
-
-```
-$ time ./bin/gpt-j -p "int main(int argc, char ** argv) {"
-
-gptj_model_load: loading model from 'models/gpt-j-6B/ggml-model.bin' - please wait ...
-gptj_model_load: n_vocab = 50400
-gptj_model_load: n_ctx   = 2048
-gptj_model_load: n_embd  = 4096
-gptj_model_load: n_head  = 16
-gptj_model_load: n_layer = 28
-gptj_model_load: n_rot   = 64
-gptj_model_load: f16     = 1
-gptj_model_load: ggml ctx size = 13334.86 MB
-gptj_model_load: memory_size =  1792.00 MB, n_mem = 57344
-gptj_model_load: ................................... done
-gptj_model_load: model size = 11542.79 MB / num tensors = 285
-main: number of tokens in prompt = 13
-
-int main(int argc, char ** argv) {
-    (void)argc;
-    (void)argv;
-
-    {
-        struct sockaddr_in addr;
-        int addrlen;
-        char * ip = "192.168.1.4";
-        int i;
-
-        if ( (addrlen = sizeof(addr)) == -1 )
-            return -1;
-
-        for (i = 0; i < 10; ++i) {
-            addr.sin_family = AF_INET;
-            addr.sin_addr.s_addr = inet_addr(ip);
-
-main: mem per token = 16430420 bytes
-main:     load time =  6211.48 ms
-main:   sample time =    13.74 ms
-main:  predict time = 26420.34 ms / 124.62 ms per token
-main:    total time = 33035.37 ms
-
-real	0m33.171s
-user	3m32.269s
-sys      0m3.686s
-
-$
-```
-
-It took ~6.2 seconds to load the model to memory. After that, it took ~26.4 seconds to generate 200 tokens of what
-looks like to be the beginning of a networking program in C. Pretty cool!
-
-Here is another run, just for fun:
-
-```
-time ./bin/gpt-j -n 500 -t 8 -p "Ask HN: Inherited the worst code and tech team I have ever seen. How to fix it?
-"
-
-gptj_model_load: loading model from 'models/gpt-j-6B/ggml-model.bin' - please wait ...
-gptj_model_load: n_vocab = 50400
-gptj_model_load: n_ctx   = 2048
-gptj_model_load: n_embd  = 4096
-gptj_model_load: n_head  = 16
-gptj_model_load: n_layer = 28
-gptj_model_load: n_rot   = 64
-gptj_model_load: f16     = 1
-gptj_model_load: ggml ctx size = 13334.86 MB
-gptj_model_load: memory_size =  1792.00 MB, n_mem = 57344
-gptj_model_load: ................................... done
-gptj_model_load: model size = 11542.79 MB / num tensors = 285
-main: number of tokens in prompt = 24
-
-Ask HN: Inherited the worst code and tech team I have ever seen. How to fix it?
-
-I've inherited a team with some very strange and un-documented practices, one of them is that they use an old custom
-application with a very slow tech stack written in Python that the team doesn't want to touch but also doesn't want to
-throw away as it has some "legacy" code in it.
-
-The problem is, the tech stack is very very slow.
-
-They have a single web server on a VM that is slow.
-The server is a little bit busy (not very busy though) and they have a lot of processes (30+ that are constantly being
-spawned by the application)
-They have an application that is single threaded and was written in Python and the team don't want to touch this, and
-the application is very slow.
-
-My task as a new member of the team is to fix this.
-
-I'm a senior dev on the team (3 years on the project) and have been told that I will take the lead on this task. I know
-next to nothing about Python. So here is what I have so far.
-
-What I have done is I've been trying to debug the processes with the "ps" command. This way I can see what is running
-and where. From what I see, the application spawns 10 processes a minute and some of them are used for nothing.
-
-I have also started to look for the code. The application source is not in GitHub or any other repository, it is only on
-our internal GitLab.
-
-What I've found so far:
-
-The application uses a custom SQLAlchemy implementation to interact with the data. I've looked at the source, it looks
-like an object cache or something like that. But from what I've seen, the cache gets full every 20 minutes and then gets
-cleared with a special command.
-
-Another strange thing is that the application creates a file for every entry in the database (even if the entry already
-exists). I've looked at the file to see if it contains something, but it seems to be a JSON file with lots of records.
-
-The other strange thing is that I can only find the database tables in the GitLab repository and not the code. So I
-can't really understand how the application is supposed to interact with the database.
-
-I also found a "log" directory, but the code is encrypted with AES. From what I've found, it is in
-
-main: mem per token = 16430420 bytes
-main:     load time =  3900.10 ms
-main:   sample time =    32.58 ms
-main:  predict time = 68049.91 ms / 130.11 ms per token
-main:    total time = 73020.05 ms
-
-real	1m13.156s
-user	9m1.328s
-sys.    0m7.103s
-```
-
-## Implementation details
-
-The high level implementation of the model is contained in the [main.cpp](main.cpp) file. The core computations are
-performed by the [ggml](https://github.com/ggerganov/ggml/blob/master/include/ggml/ggml.h) library.
-
-
-#### Matrix multiplication
-
-The most performance critical part of the implementation is of course the matrix multiplication routine. 99% of the time
-is spent here, so it was important to optimize this as much as possible.
-
-On Arm64, I utilize the 128-bit NEON intrinsics for 16-bit floating point operations:
-
-https://github.com/ggerganov/ggml/blob/fb558f78d905f85c54813602649ddd628ffe0f3a/src/ggml.c#L187-L243
-
-These instructions allow each core to operate simultaneously on 64 16-bit floats. I'm no expert in SIMD, but after quite
-some trials this was the most efficient code for dot product of a row and column that I could come up with. Combined
-with the parallel computation on 8 CPU threads, I believe I'm close to the maximum performance that one could possibly
-get on the M1 CPU. Still, I'm curious to know if there is a more efficient way to implement this.
-
-
-#### Attempt to use the M1 GPU
-
-One interesting property of the GPT-J transformer architecture is that it allows you to perform part of the inference in
-parallel - i.e. the Feed-forward network can be computed in parallel to the Self-attention layer:
-
-https://github.com/ggerganov/ggml/blob/fb558f78d905f85c54813602649ddd628ffe0f3a/examples/gpt-j/main.cpp#L507-L531
-
-So I thought why not try and bring in the M1 GPU to compute half of the neural network in parallel to the CPU and
-potentially gain some extra performance. Thanks to the M1's shared memory model, it was relatively easy to offload part
-of the computation to the GPU using Apple's [Metal Performance
-Shaders](https://developer.apple.com/documentation/metalperformanceshaders). The GPU shares the host memory, so there is
-no need to copy the data back and forth as you would normally do with Cuda or OpenCL. The weight matrices are directly
-available to be used by the GPU.
-
-However, to my surprise, using MPS together with the CPU did not lead to any performance improvement at all. My
-conclusion was that the 8-thread NEON CPU computation is already saturating the memory bandwidth of the M1 and since
-the CPU and the GPU on the MacBook are sharing that bandwidth, it does not help to offload the computation to the GPU.
-Another observation was that the MPS GPU matrix multiplication using 16-bit floats had the same performance as the
-8-thread NEON CPU implementation. Again, I explain this with a saturated memory channel. But of course, my explanation
-could be totally wrong and somehow the implementation wasn't utilizing the resources correctly.
-
-In the end, I decided to not use MPS or the GPU all together.
-
-### Zero memory allocations
-
-Another property of my implementation is that it does not perform any memory allocations once the model is loaded into
-memory. All required memory is allocated at the start of the program with a single `malloc` (technically 2 calls, but
-that is not important).
-
-## Usage
-
-If you want to give this a try and you are on Linux or Mac OS, simply follow these instructions:
-
-```bash
-# Clone the ggml library and build the gpt-j example
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j4 gpt-j
-
-# Download the ggml-compatible GPT-J 6B model (requires 12GB disk space)
-../examples/gpt-j/download-ggml-model.sh 6B
-
-# Run the inference (requires 16GB of CPU RAM)
-./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example"
-
-# Input prompt through pipe and run the inference.
-echo "This is an example" > prompt.txt
-cat prompt.txt | ./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin
-```
-
-To run the `gpt-j` tool, you need the 12GB `ggml-model.bin` file which contains the GPT-J model in
-[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, the binary file
-is downloaded from my repository on Hugging Face using the [download-ggml-model.sh](download-ggml-model.sh) script.
-You can also, download the file manually from this link:
-
-https://huggingface.co/ggerganov/ggml/tree/main
-
----
-
-Alternatively, if you don't want to download the 12GB ggml model file, you can perform the conversion yourself using
-python.
-
-First, you need to download the full GPT-J model from here: https://huggingface.co/EleutherAI/gpt-j-6B
-
-Note that the full model is quite big - about 72 GB. After you download it, you need to convert it to ggml format using
-the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script. This will generate the `ggml-model.bin` file, which you can
-then use with the `gpt-j` program.
-
-
-## GPT-2
-
-I also implemented a tool for CPU inference using the smaller GPT-2 models. They have worse quality compared to GPT-J,
-but are much faster to execute.
-
-For example, the Small GPT-2 model is only 240 MB big and the inference speed on my MacBook is about 200 tokens/sec.
-
-For more details, checkout the GPT-2 example here: [gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)

+ 0 - 173
ggml/examples/gpt-j/convert-h5-to-ggml.py

@@ -1,173 +0,0 @@
-# Convert GPT-J-6B h5 transformer model to ggml format
-#
-# Load the model using GPTJForCausalLM.
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
-
-import sys
-import struct
-import json
-import torch
-import numpy as np
-
-from transformers import GPTJForCausalLM
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
-    encoder_added = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-list_vars = model.state_dict()
-#print (list_vars)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", hparams["rotary_dim"]))
-fout.write(struct.pack("i", ftype))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder) + len(encoder_added)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for key in encoder_added:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0;
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # for efficiency - transpose these matrices:
-    # (note - with latest ggml this is no longer more efficient, so disabling it)
-    #  "transformer.h.*.mlp.fc_in.weight"
-    #  "transformer.h.*.attn.out_proj.weight"
-    #  "transformer.h.*.attn.q_proj.weight"
-    #  "transformer.h.*.attn.k_proj.weight"
-    #  "transformer.h.*.attn.v_proj.weight"
-    #if name.endswith(".mlp.fc_in.weight")     or \
-    #   name.endswith(".attn.out_proj.weight") or \
-    #   name.endswith(".attn.q_proj.weight")   or \
-    #   name.endswith(".attn.k_proj.weight")   or \
-    #   name.endswith(".attn.v_proj.weight"):
-    #    print("  Transposing")
-    #    data = data.transpose()
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 69
ggml/examples/gpt-j/download-ggml-model.sh

@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# This script downloads GPT-J model files that have already been converted to ggml format.
-# This way you don't have to convert them yourself.
-#
-# If you want to download the original GPT-J model files, use the "download-model.sh" script instead.
-
-#src="https://ggml.ggerganov.com"
-#pfx="ggml-model-gpt-j"
-
-src="https://huggingface.co/ggerganov/ggml"
-pfx="resolve/main/ggml-model-gpt-j"
-
-ggml_path=$(dirname $(realpath $0))
-
-# GPT-J models
-models=( "6B" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download ggml model
-
-printf "Downloading ggml model $model ...\n"
-
-mkdir -p models/gpt-j-$model
-
-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O models/gpt-j-$model/ggml-model.bin $src/$pfx-$model.bin
-elif [ -x "$(command -v curl)" ]; then
-    curl -L --output models/gpt-j-$model/ggml-model.bin $src/$pfx-$model.bin
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
-
-if [ $? -ne 0 ]; then
-    printf "Failed to download ggml model $model \n"
-    printf "Please try again later or download the original GPT-J model files and convert them yourself.\n"
-    exit 1
-fi
-
-printf "Done! Model '$model' saved in 'models/gpt-j-$model/ggml-model.bin'\n"
-printf "You can now use it like this:\n\n"
-printf "  $ ./bin/gpt-j -m models/gpt-j-$model/ggml-model.bin -p \"This is an example\"\n"
-printf "\n"

+ 0 - 11
ggml/examples/gpt-j/download-model.sh

@@ -1,11 +0,0 @@
-#!/bin/bash
-
-printf "To obtain the GPT-J 6B model files, please visit: https://huggingface.co/EleutherAI/gpt-j-6B\n\n"
-
-printf "The model is very big. For example, the reposirory above is 72GB in size.\n"
-printf "If you are sure that you want to clone it, simply run the following command:\n\n"
-
-printf " $ git clone https://huggingface.co/EleutherAI/gpt-j-6B models/gpt-j-6B\n\n"
-
-printf "Alternatively, use the 'download-ggml-model.sh' script to download a 12GB ggml version of the model.\n"
-printf "This version is enough to run inference using the ggml library.\n\n"

+ 0 - 748
ggml/examples/gpt-j/main.cpp

@@ -1,748 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-
-// default hparams (GPT-J 6B)
-struct gptj_hparams {
-    int32_t n_vocab = 50400;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 16;
-    int32_t n_layer = 28;
-    int32_t n_rot   = 64;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gptj_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_q_proj_w;
-    struct ggml_tensor * c_attn_k_proj_w;
-    struct ggml_tensor * c_attn_v_proj_w;
-
-    struct ggml_tensor * c_attn_proj_w;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gptj_model {
-    gptj_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gptj_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype);         // lmh_g
-        ctx_size +=        n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
-
-        ctx_size += (5 + 10*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
-
-        // map by name
-        model.tensors["transformer.wte.weight"] = model.wte;
-
-        model.tensors["transformer.ln_f.weight"] = model.ln_f_g;
-        model.tensors["transformer.ln_f.bias"]   = model.ln_f_b;
-
-        model.tensors["lm_head.weight"] = model.lmh_g;
-        model.tensors["lm_head.bias"]   = model.lmh_b;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-
-            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-
-            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);
-            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"]          = layer.ln_1_g;
-            model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"]            = layer.ln_1_b;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"]   = layer.c_attn_q_proj_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"]   = layer.c_attn_k_proj_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"]   = layer.c_attn_v_proj_w;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"]     = layer.c_mlp_fc_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"]       = layer.c_mlp_fc_b;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"]    = layer.c_mlp_proj_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"]      = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.c_str(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-// The GPT-J model requires about 16MB of memory per input token.
-//
-bool gptj_eval(
-        const gptj_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    // wte
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        struct ggml_tensor * inpSA = cur;
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
-                        (   n_ctx)*ggml_element_size(model.memory_v),
-                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, model.memory_v,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(model.memory_v),
-                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-        }
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        // this is independent of the self-attention result, so it could be done in parallel to the self-attention
-        {
-            // note here we pass inpSA instead of cur
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    inpSA);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // cur = proj_w*cur + proj_b
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // self-attention + FF
-        cur  = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpL);
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // lm_head
-    {
-        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-        inpL = ggml_add(ctx0,
-                ggml_repeat(ctx0, model.lmh_b, inpL),
-                inpL);
-    }
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-j-6B/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gptj_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gptj_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 182
ggml/examples/gpt-j/quantize.cpp

@@ -1,182 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (GPT-J 6B)
-struct gptj_hparams {
-    int32_t n_vocab = 50400;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 16;
-    int32_t n_layer = 28;
-    int32_t n_rot   = 64;
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    gptj_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 13
ggml/examples/gpt-neox/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# gpt-neox
-
-set(TEST_TARGET gpt-neox)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-neox-quantize
-
-set(TEST_TARGET gpt-neox-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 110
ggml/examples/gpt-neox/README.md

@@ -1,110 +0,0 @@
-# GPT-NeoX
-
-Transformer architecture: GPT-NeoX
-
-Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha
-
-## Usage
-
-```bash
-# get the repo and build it
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j
-
-# get the StableLM 3B Alpha model
-git clone https://huggingface.co/stabilityai/gpt_neox-base-alpha-3b
-
-# install Python dependencies
-python3 -m pip install -r ../requirements.txt
-
-# convert model to FP16
-python3 ../examples/gpt-neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1
-
-# run inference using FP16 precision
-make -j && ./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
-
-main: seed = 1681940611
-gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-f16.bin' - please wait ...
-gpt_neox_model_load: n_vocab = 50688
-gpt_neox_model_load: n_ctx   = 4096
-gpt_neox_model_load: n_embd  = 4096
-gpt_neox_model_load: n_head  = 32
-gpt_neox_model_load: n_layer = 16
-gpt_neox_model_load: n_rot   = 32
-gpt_neox_model_load: ftype   = 1
-gpt_neox_model_load: ggml ctx size = 10011.10 MB
-gpt_neox_model_load: memory_size =  2048.00 MB, n_mem = 65536
-gpt_neox_model_load: ................................ done
-gpt_neox_model_load: model size =  6939.28 MB / num tensors = 260
-main: number of tokens in prompt = 7
-main: token[0] =     42, I
-main: token[1] =   2868,  believe
-main: token[2] =    253,  the
-main: token[3] =   4495,  meaning
-main: token[4] =    273,  of
-main: token[5] =   1495,  life
-main: token[6] =    310,  is
-
-I believe the meaning of life is to grow, to find a way, to love, to find an appreciation for life, and to live it with all of its beauty.
-
-For I am the child of God. I am the offspring of God's love. I am the offspring of the light of the world. I am the offspring of the
-
-main: mem per token = 12186760 bytes
-main:     load time =  2118.55 ms
-main:   sample time =     9.59 ms
-main:  predict time =  4474.07 ms / 63.92 ms per token
-main:    total time =  6911.26 ms
-```
-
-## 5-bit integer quantization mode
-
-```bash
-# quantize the model to 5-bits using Q5_0 quantization
-./bin/gpt-neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0
-
-# run the quantized model
-./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64
-
-main: seed = 1682021489
-gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-q5_0.bin' - please wait ...
-gpt_neox_model_load: n_vocab = 50688
-gpt_neox_model_load: n_ctx   = 4096
-gpt_neox_model_load: n_embd  = 4096
-gpt_neox_model_load: n_head  = 32
-gpt_neox_model_load: n_layer = 16
-gpt_neox_model_load: n_rot   = 32
-gpt_neox_model_load: ftype   = 6
-gpt_neox_model_load: ggml ctx size = 5676.10 MB
-gpt_neox_model_load: memory_size =  1024.00 MB, n_mem = 65536
-gpt_neox_model_load: ........................ done
-gpt_neox_model_load: model size =  2604.28 MB / num tensors = 196
-main: number of tokens in prompt = 7
-main: token[0] =     42, I
-main: token[1] =   2868,  believe
-main: token[2] =    253,  the
-main: token[3] =   4495,  meaning
-main: token[4] =    273,  of
-main: token[5] =   1495,  life
-main: token[6] =    310,  is
-
-I believe the meaning of life is to love and be loved. The last three verses were enough to tie us all together. If you love someone you love them all. There are some things in this world that are just not equal in Heaven. - Be here in this moment.
-
-This world is not what is outside of us. It is what
-
-main: mem per token = 12958024 bytes
-main:     load time =   850.51 ms
-main:   sample time =     9.95 ms
-main:  predict time =  3103.81 ms / 44.34 ms per token
-main:    total time =  4177.68 ms
-
-```
-
-## Notes
-
-- No guarantees for correctness
-- The tokenizer is currently hacked - probably works only for English
-- Non-parallel residual is not supported
-- Contributions and improvements are welcome

+ 0 - 107
ggml/examples/gpt-neox/convert-h5-to-ggml.py

@@ -1,107 +0,0 @@
-import sys
-import struct
-import json
-import numpy as np
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-
-list_vars = model.state_dict()
-for name in list_vars.keys():
-    print(name, list_vars[name].shape, list_vars[name].dtype)
-
-fout = open(fname_out, "wb")
-
-print(hparams)
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["max_position_embeddings"]))
-fout.write(struct.pack("i", hparams["hidden_size"]))
-fout.write(struct.pack("i", hparams["num_attention_heads"]))
-fout.write(struct.pack("i", hparams["num_hidden_layers"]))
-fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
-fout.write(struct.pack("i", hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True))
-fout.write(struct.pack("i", ftype))
-
-# TODO: temporary hack to not deal with implementing the tokenizer
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([i]).encode('utf-8')
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith(".attention.masked_bias") or     \
-       name.endswith(".attention.bias") or \
-       name.endswith(".attention.rotary_emb.inv_freq"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape)
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 814
ggml/examples/gpt-neox/main.cpp

@@ -1,814 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cinttypes>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (StableLM 3B)
-struct gpt_neox_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 4096;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 32;
-    int32_t n_layer = 16;
-    int32_t n_rot   = 32; // rotary_pct * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt_neox_layer {
-    // pre normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // post normalization
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt_neox_model {
-    gpt_neox_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    //struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gpt_neox_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: par_res = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd  = hparams.n_embd;
-        const size_t n_layer = hparams.n_layer;
-        const size_t n_ctx   = hparams.n_ctx;
-        const size_t n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype);           // lmh_g
-        //ctx_size +=        n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
-
-        ctx_size += (6 + 16*n_layer)*1024; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        //model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
-
-        // map by name
-        model.tensors["gpt_neox.embed_in.weight"] = model.wte;
-
-        model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
-        model.tensors["gpt_neox.final_layer_norm.bias"]   = model.ln_f_b;
-
-        model.tensors["embed_out.weight"] = model.lmh_g;
-        //model.tensors["lm_head.bias"]   = model.lmh_b;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_proj_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);
-            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"]   = layer.ln_1_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"]   = layer.c_attn_attn_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"]   = layer.c_attn_proj_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"]   = layer.ln_2_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"]   = layer.c_mlp_fc_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"]   = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int64_t n_mem      = n_layer*n_ctx;
-        const int64_t n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-
-// feed-forward network
-ggml_tensor * gpt_neox_ff(
-        const gpt_neox_layer & layer,
-        ggml_context         * ctx0,
-        ggml_tensor          * inp,
-        float                  eps) {
-    ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
-
-    cur = ggml_add(ctx0,
-        ggml_mul(ctx0,
-            ggml_repeat(ctx0, layer.ln_2_g, cur),
-            cur),
-        ggml_repeat(ctx0, layer.ln_2_b, cur));
-
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_fc_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
-            cur);
-
-    // GELU activation
-    cur = ggml_gelu(ctx0, cur);
-
-    // projection
-    // cur = proj_w*cur + proj_b
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_proj_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
-            cur);
-    return cur;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt_neox_eval(
-        const gpt_neox_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
-    static void * scr0 = malloc(scr0_size);
-
-    static size_t scr1_size = 256u*1024*1024;
-    static void * scr1 = malloc(scr1_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    // wte
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-        // self-attention
-        {
-            {
-                cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-            }
-
-            // compute QKV
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_attn_w,
-                        cur);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                        cur);
-            }
-
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
-
-            // using mode = 2 for GPT-NeoX mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0);
-
-            // store key and value to memory
-            {
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
-                        (   n_ctx)*ggml_element_size(model.memory_v),
-                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, model.memory_v,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(model.memory_v),
-                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_proj_w,
-                        cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
-            }
-        }
-
-        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-
-        if (hparams.par_res == 0) {
-            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
-
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpFF);
-        } else {
-            struct ggml_tensor * inpFF = cur;
-
-            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
-            // note here we pass inpL instead of cur
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);
-
-            // layer input + FF
-            cur  = ggml_add(ctx0, cur, inpFF);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpL);
-        }
-    }
-
-    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // lm_head
-    {
-        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-        //inpL = ggml_add(ctx0,
-        //        ggml_repeat(ctx0, model.lmh_b, inpL),
-        //        inpL);
-    }
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/stablelm-base-alpha-3b/ggml-model-f16.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt_neox_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt_neox_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 0) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 178
ggml/examples/gpt-neox/quantize.cpp

@@ -1,178 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (StableLM 3B)
-struct gpt_neox_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 4096;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 32;
-    int32_t n_layer = 16;
-    int32_t n_rot   = 32; // 0.25 * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    gpt_neox_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: par_res     = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-neox-quantize models/stalellm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 40
ggml/examples/mnist/CMakeLists.txt

@@ -1,40 +0,0 @@
-#
-# mnist
-
-set(TEST_TARGET mnist)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
-
-#
-# mnist-cnn
-
-set(TEST_TARGET mnist-cnn)
-add_executable(${TEST_TARGET} main-cnn.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
-
-#
-# mnist-cpu
-
-set(TEST_TARGET mnist-cpu)
-add_executable(${TEST_TARGET} main-cpu.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-
-if (APPLE)
-    #
-    # mnist-mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
-    set(TEST_TARGET mnist-mtl)
-    add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-    )
-endif()

+ 0 - 128
ggml/examples/mnist/README.md

@@ -1,128 +0,0 @@
-# MNIST Examples for GGML
-
-These are simple examples of how to use GGML for inferencing.
-The first example uses convolutional neural network (CNN), the second one uses fully connected neural network.
-
-## Building the examples
-
-```bash
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j4 mnist-cnn mnist
-```
-
-## MNIST with CNN
-
-This implementation achieves ~99% accuracy on the MNIST test set.
-
-### Training the model
-
-Use the `mnist-cnn.py` script to train the model and convert it to GGUF format:
-
-```
-$ python3 ../examples/mnist/mnist-cnn.py train mnist-cnn-model
-...
-Keras model saved to 'mnist-cnn-model'
-```
-
-Convert the model to GGUF format:
-
-```
-$ python3 ../examples/mnist/mnist-cnn.py convert mnist-cnn-model
-...
-Model converted and saved to 'mnist-cnn-model.gguf'
-```
-
-### Running the example
-
-```bash
-$ ./bin/mnist-cnn mnist-cnn-model.gguf ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-main: loaded model in     5.17 ms
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ * * * * * _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ * * * * * * * * _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * * * * _ _ _ * * _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ * * * * * _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * * * * * * * * _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ * * * * * * * * * * _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ * * * * * * _ _ * * * _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * * _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ * * * * * * * * * * _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * * * * _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-
-ggml_graph_dump_dot: dot -Tpng mnist-cnn.dot -o mnist-cnn.dot.png && open mnist-cnn.dot.png
-main: predicted digit is 8
-```
-
-Computation graph:
-
-![mnist dot](https://user-images.githubusercontent.com/1991296/263763842-3b679b45-7ca1-4ee9-b19a-82e34396624f.png)
-
-## MNIST with fully connected network
-
-A fully connected layer + relu, followed by a fully connected layer + softmax.
-
-### Training the Model
-
-A Google Colab notebook for training a simple two-layer network to recognize digits is located here. You can
-use this to save a pytorch model to be converted to ggml format.
-
-[Colab](https://colab.research.google.com/drive/12n_8VNJnolBnX5dVS0HNWubnOjyEaFSb?usp=sharing)
-
-GGML "format" is whatever you choose for efficient loading. In our case, we just save the hyperparameters used
-plus the model weights and biases. Run convert-h5-to-ggml.py to convert your pytorch model. The output format is:
-
-- magic constant (int32)
-- repeated list of tensors
-- number of dimensions of tensor (int32)
-- tensor dimension (int32 repeated)
-- values of tensor (int32)
-
-Run ```convert-h5-to-ggml.py mnist_model.state_dict``` where `mnist_model.state_dict` is the saved pytorch model from the Google Colab. For
-quickstart, it is included in the mnist/models directory.
-
-```bash
-mkdir -p models/mnist
-python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict
-```
-
-### Running the example
-
-```bash
-./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-```
-
-Computation graph:
-
-![mnist dot](https://user-images.githubusercontent.com/1991296/231882071-84e29d53-b226-4d73-bdc2-5bd6dcb7efd1.png)
-
-
-## Web demo
-
-The example can be compiled with Emscripten like this:
-
-```bash
-cd examples/mnist
-emcc -I../../include -I../../include/ggml -I../../examples ../../src/ggml.c main.cpp -o web/mnist.js -s EXPORTED_FUNCTIONS='["_wasm_eval","_wasm_random_digit","_malloc","_free"]' -s EXPORTED_RUNTIME_METHODS='["ccall"]' -s ALLOW_MEMORY_GROWTH=1 --preload-file models/mnist
-```
-
-Online demo: https://mnist.ggerganov.com

+ 0 - 63
ggml/examples/mnist/convert-h5-to-ggml.py

@@ -1,63 +0,0 @@
-# Convert MNIS h5 transformer model to ggml format
-#
-# Load the (state_dict) saved model using PyTorch
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# At the start of the ggml file we write the model parameters
-
-import sys
-import struct
-import json
-import numpy as np
-import re
-
-
-import torch
-import torch.nn as nn
-import torchvision.datasets as dsets
-import torchvision.transforms as transforms
-from torch.autograd import Variable
-
-if len(sys.argv) != 2:
-    print("Usage: convert-h5-to-ggml.py model\n")
-    sys.exit(1)
-
-state_dict_file = sys.argv[1]
-fname_out = "models/mnist/ggml-model-f32.bin"
-
-state_dict = torch.load(state_dict_file, map_location=torch.device('cpu'))
-#print (model)
-
-list_vars = state_dict
-print (list_vars)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape) 
-    n_dims = len(data.shape);
-   
-    fout.write(struct.pack("i", n_dims))
-    
-    data = data.astype(np.float32)
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 169
ggml/examples/mnist/main-cnn.cpp

@@ -1,169 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct mnist_model {
-    struct ggml_tensor * conv2d_1_kernel;
-    struct ggml_tensor * conv2d_1_bias;
-    struct ggml_tensor * conv2d_2_kernel;
-    struct ggml_tensor * conv2d_2_bias;
-    struct ggml_tensor * dense_weight;
-    struct ggml_tensor * dense_bias;
-    struct ggml_context * ctx;
-};
-
-bool mnist_model_load(const std::string & fname, mnist_model & model) {
-    struct gguf_init_params params = {
-        /*.no_alloc   =*/ false,
-        /*.ctx        =*/ &model.ctx,
-    };
-    gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-    if (!ctx) {
-        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
-        return false;
-    }
-    model.conv2d_1_kernel = ggml_get_tensor(model.ctx, "kernel1");
-    model.conv2d_1_bias = ggml_get_tensor(model.ctx, "bias1");
-    model.conv2d_2_kernel = ggml_get_tensor(model.ctx, "kernel2");
-    model.conv2d_2_bias = ggml_get_tensor(model.ctx, "bias2");
-    model.dense_weight = ggml_get_tensor(model.ctx, "dense_w");
-    model.dense_bias = ggml_get_tensor(model.ctx, "dense_b");
-    return true;
-}
-
-int mnist_eval(
-        const mnist_model & model,
-        const int n_threads,
-        std::vector<float> digit,
-        const char * fname_cgraph
-        )
-{
-    static size_t buf_size = 100000 * sizeof(float) * 4;
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * input = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 28, 28, 1, 1);
-    memcpy(input->data, digit.data(), ggml_nbytes(input));
-    ggml_set_name(input, "input");
-    ggml_tensor * cur = ggml_conv_2d(ctx0, model.conv2d_1_kernel, input, 1, 1, 0, 0, 1, 1);
-    cur = ggml_add(ctx0, cur, model.conv2d_1_bias);
-    cur = ggml_relu(ctx0, cur);
-    // Output shape after Conv2D: (26 26 32 1)
-    cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    // Output shape after MaxPooling2D: (13 13 32 1)
-    cur = ggml_conv_2d(ctx0, model.conv2d_2_kernel, cur, 1, 1, 0, 0, 1, 1);
-    cur = ggml_add(ctx0, cur, model.conv2d_2_bias);
-    cur = ggml_relu(ctx0, cur);
-    // Output shape after Conv2D: (11 11 64 1)
-    cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    // Output shape after MaxPooling2D: (5 5 64 1)
-    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    // Output shape after permute: (64 5 5 1)
-    cur = ggml_reshape_2d(ctx0, cur, 1600, 1);
-    // Final Dense layer
-    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.dense_weight, cur), model.dense_bias);
-    ggml_tensor * probs = ggml_soft_max(ctx0, cur);
-    ggml_set_name(probs, "probs");
-
-    ggml_build_forward_expand(&gf, probs);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //ggml_graph_print(&gf);
-    ggml_graph_dump_dot(&gf, NULL, "mnist-cnn.dot");
-
-    if (fname_cgraph) {
-        // export the compute graph for later use
-        // see the "mnist-cpu" example
-        ggml_graph_export(&gf, fname_cgraph);
-
-        fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph);
-    }
-
-    const float * probs_data = ggml_get_data_f32(probs);
-    const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
-    ggml_free(ctx0);
-    return prediction;
-}
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/mnist-cnn.gguf models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    mnist_model model;
-    std::vector<float> digit;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mnist_model_load(argv[1], model)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, argv[1]);
-            return 1;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        fprintf(stdout, "%s: loaded model in %8.2f ms\n", __func__, t_load_us / 1000.0f);
-    }
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col] / 255.0f);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(model, 1, digit, nullptr);
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-    ggml_free(model.ctx);
-    return 0;
-}

+ 0 - 122
ggml/examples/mnist/main-cpu.cpp

@@ -1,122 +0,0 @@
-// Use a pre-generated MNIST compute graph for inference on the CPU
-//
-// You can generate a compute graph using the "mnist" tool:
-//
-// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-// This command creates the "mnist.ggml" file, which contains the generated compute graph.
-// Now, you can re-use the compute graph with the "mnist-cpu" tool:
-//
-// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-
-#include "ggml/ggml.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// evaluate the MNIST compute graph
-//
-//   - fname_cgraph: path to the compute graph
-//   - n_threads:    number of threads to use
-//   - digit:        784 pixel values
-//
-// returns 0 - 9 prediction
-int mnist_eval(
-        const char * fname_cgraph,
-        const int n_threads,
-        std::vector<float> digit) {
-    // load the compute graph
-    struct ggml_context * ctx_data = NULL;
-    struct ggml_context * ctx_eval = NULL;
-
-    struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-
-    // param export/import test
-    GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == int(0xdeadbeef));
-
-    // allocate work context
-    // needed during ggml_graph_compute() to allocate a work tensor
-    static size_t buf_size = 128ull*1024*1024; // TODO
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx_work = ggml_init(params);
-
-    struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input");
-    memcpy(input->data, digit.data(), ggml_nbytes(input));
-
-    ggml_graph_compute_with_ctx(ctx_work, &gfi, n_threads);
-
-    const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs"));
-
-    const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_data);
-    ggml_free(ctx_eval);
-
-    return prediction;
-}
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    std::vector<float> digit;
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col]);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(argv[1], 1, digit);
-
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-
-    return 0;
-}

+ 0 - 125
ggml/examples/mnist/main-mtl.cpp

@@ -1,125 +0,0 @@
-// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS
-//
-// You can generate a compute graph using the "mnist" tool:
-//
-// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-// This command creates the "mnist.ggml" file, which contains the generated compute graph.
-// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool:
-//
-// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-
-#include "ggml/ggml.h"
-
-#include "main-mtl.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <vector>
-
-// evaluate the MNIST compute graph
-//
-//   - fname_cgraph: path to the compute graph
-//   - digit:        784 pixel values
-//
-// returns 0 - 9 prediction
-int mnist_eval(
-        const char * fname_cgraph,
-        std::vector<float> digit
-        ) {
-    // load the compute graph
-    struct ggml_context * ctx_data = NULL;
-    struct ggml_context * ctx_eval = NULL;
-
-    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-
-    // allocate work context
-    static size_t buf_size = 128ull*1024*1024; // TODO
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx_work = ggml_init(params);
-
-    // this allocates all Metal resources and memory buffers
-    auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf);
-
-    int prediction = -1;
-
-    for (int i = 0; i < 1; ++i) {
-        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");
-
-        if (i % 2 == 0) {
-            memcpy(input->data, digit.data(), ggml_nbytes(input));
-        } else {
-            memset(input->data, 0, ggml_nbytes(input));
-        }
-
-        // the actual inference happens here
-        prediction = mnist_mtl_eval(ctx_mtl, &gf);
-    }
-
-    mnist_mtl_free(ctx_mtl);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_data);
-    ggml_free(ctx_eval);
-
-    return prediction;
-}
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    std::vector<float> digit;
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col]);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(argv[1], digit);
-
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-
-    return 0;
-}

+ 0 - 26
ggml/examples/mnist/main-mtl.h

@@ -1,26 +0,0 @@
-#pragma once
-
-struct ggml_context;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mtl_context;
-
-struct ggml_mtl_context * mnist_mtl_init(
-        struct ggml_context * ctx_data,
-        struct ggml_context * ctx_eval,
-        struct ggml_context * ctx_work,
-        struct ggml_cgraph  * gf);
-
-void mnist_mtl_free(struct ggml_mtl_context * ctx);
-
-int mnist_mtl_eval(
-        struct ggml_mtl_context * ctx,
-        struct ggml_cgraph      * gf);
-
-#ifdef __cplusplus
-}
-#endif

+ 0 - 499
ggml/examples/mnist/main-mtl.m

@@ -1,499 +0,0 @@
-#import "main-mtl.h"
-
-#import "ggml/ggml.h"
-
-#import <Foundation/Foundation.h>
-#import <Metal/Metal.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
-
-// TODO: couldn't get this to work
-//#define GGML_MTL_HEAP
-
-struct ggml_mtl_context {
-    struct ggml_context * ctx_data;
-    struct ggml_context * ctx_eval;
-    struct ggml_context * ctx_work;
-
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue;
-    id<MTLLibrary>      library;
-
-#ifdef GGML_MTL_HEAP
-    id<MTLHeap> heap_data;
-    id<MTLHeap> heap_eval;
-#else
-    id<MTLBuffer> buffer_data;
-    id<MTLBuffer> buffer_eval;
-#endif
-
-    id<MTLBuffer> out;
-
-    // custom kernels
-    id<MTLFunction>             function_add;
-    id<MTLComputePipelineState> pipeline_add;
-
-    id<MTLFunction>             function_relu;
-    id<MTLComputePipelineState> pipeline_relu;
-
-    id<MTLFunction>             function_soft_max;
-    id<MTLComputePipelineState> pipeline_soft_max;
-};
-
-// MSL code
-NSString * const msl_library_mnist = @"\
-#include <metal_stdlib>                                                                 \n\
-using namespace metal;                                                                  \n\
-                                                                                        \n\
-#define MAX(x, y) ((x) > (y) ? (x) : (y))                                               \n\
-                                                                                        \n\
-constant int k_digits [[function_constant(0)]];                                         \n\
-                                                                                        \n\
-kernel void kernel_add(                                                                 \n\
-        device const float * src0,                                                      \n\
-        device const float * src1,                                                      \n\
-        device float * dst,                                                             \n\
-        uint gid[[thread_position_in_grid]]) {                                          \n\
-    dst[gid] = src0[gid] + src1[gid];                                                   \n\
-}                                                                                       \n\
-                                                                                        \n\
-kernel void kernel_relu(                                                                \n\
-        device const float * src,                                                       \n\
-        device       float * dst,                                                       \n\
-        uint gid[[thread_position_in_grid]]) {                                          \n\
-    dst[gid] = max(0.0f, src[gid]);                                                     \n\
-}                                                                                       \n\
-                                                                                        \n\
-kernel void kernel_soft_max(                                                            \n\
-        device const float * src,                                                       \n\
-        device       float * dst,                                                       \n\
-        uint gid[[thread_position_in_grid]]) {                                          \n\
-    float max = 0.0f;                                                                   \n\
-    for (int i = 0; i < k_digits; i++) {                                                \n\
-        max = MAX(max, src[i]);                                                         \n\
-    }                                                                                   \n\
-    float sum = 0.0f;                                                                   \n\
-    for (int i = 0; i < k_digits; i++) {                                                \n\
-        dst[i] = exp(src[i] - max);                                                     \n\
-        sum += dst[i];                                                                  \n\
-    }                                                                                   \n\
-    for (int i = 0; i < k_digits; i++) {                                                \n\
-        dst[i] /= sum;                                                                  \n\
-    }                                                                                   \n\
-}                                                                                       \n\
-";
-
-struct ggml_mtl_context * mnist_mtl_init(
-    struct ggml_context * ctx_data,
-    struct ggml_context * ctx_eval,
-    struct ggml_context * ctx_work,
-    struct ggml_cgraph  * gf) {
-    fprintf(stderr, "%s: allocating\n", __func__);
-
-    struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));
-
-    ctx->ctx_data = ctx_data;
-    ctx->ctx_eval = ctx_eval;
-    ctx->ctx_work = ctx_work;
-
-    ctx->device = MTLCreateSystemDefaultDevice();
-    ctx->queue  = [ctx->device newCommandQueue];
-
-    // determine if we can use MPS
-    if (MPSSupportsMTLDevice(ctx->device)) {
-        fprintf(stderr, "%s: using MPS\n", __func__);
-    } else {
-        fprintf(stderr, "%s: not using MPS\n", __func__);
-        GGML_ASSERT(false && "MPS not supported");
-    }
-
-    // compile from source string and show compile log
-    {
-        NSError * error = nil;
-        ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error];
-        if (error) {
-            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
-        }
-    }
-
-    // load kernels
-    {
-        const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0];
-
-        MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new];
-        [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"];
-
-        ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
-        ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
-
-        ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
-        ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
-
-        ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
-        ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
-    }
-
-#ifdef GGML_MTL_HEAP
-    // MTLHeap approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = ggml_get_mem_buffer(ctx_data);
-        const size_t mem_size   = ggml_get_mem_size(ctx_data);
-
-        MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new];
-        heap_desc.storageMode = MTLStorageModeShared;
-        heap_desc.size        = mem_size;
-
-        printf("heap_desc.size = %zu\n", mem_size);
-
-        ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc];
-        [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed?
-        ctx->heap_data.label = @"heap_data";
-
-        printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]);
-
-        id<MTLBuffer> buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-        if (!buffer) {
-            fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__);
-            exit(1);
-        }
-
-        // copy data from CPU to GPU
-        memcpy([buffer contents], mem_buffer, mem_size);
-
-        fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this heap will be used for the intermediate results of the evaluation
-    {
-        const size_t mem_size = ggml_get_mem_size(ctx_eval);
-
-        MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new];
-        heap_desc.storageMode = MTLStorageModePrivate; // GPU only
-        heap_desc.size        = mem_size;
-
-        ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc];
-        [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed?
-
-        fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size);
-    }
-#else
-    // MTLBuffer approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = ggml_get_mem_buffer(ctx_data);
-        const size_t mem_size   = ggml_get_mem_size(ctx_data);
-
-        ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this buffer will be used for the intermediate results of the evaluation
-    {
-        const size_t mem_size = ggml_get_mem_size(ctx_eval);
-
-        ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate];
-
-        fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size);
-    }
-#endif
-
-    // allocate buffer for result extraction
-    {
-        const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]);
-
-        ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size);
-    }
-
-    return ctx;
-}
-
-void mnist_mtl_free(struct ggml_mtl_context * ctx) {
-    fprintf(stderr, "%s: deallocating\n", __func__);
-
-    free(ctx);
-}
-
-#ifdef GGML_MTL_HEAP
-
-// make a view of the respective MTL heap
-id<MTLBuffer> mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data);
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval);
-
-    const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval);
-
-    const size_t t_size = ggml_nbytes(t);
-    const size_t t_offs = is_data ? offs_data : offs_eval;
-
-    id<MTLBuffer> result;
-
-    if (is_data) {
-        fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs];
-    } else {
-        fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs];
-    }
-
-    if (result == nil) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    return result;
-}
-
-#else
-
-// get data / eval buffer + offset
-id<MTLBuffer> mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data);
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval);
-
-    const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval);
-
-    const size_t t_size = ggml_nbytes(t);
-    const size_t t_offs = is_data ? offs_data : offs_eval;
-
-    id<MTLBuffer> result;
-
-    if (is_data) {
-        fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = ctx->buffer_data;
-    } else {
-        fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = ctx->buffer_eval;
-    }
-
-    if (result == nil) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    if (offs != nil) {
-        *offs = t_offs;
-    }
-
-    return result;
-}
-
-#endif
-
-int mnist_mtl_eval(
-        struct ggml_mtl_context * ctx,
-        struct ggml_cgraph      * gf) {
-    fprintf(stderr, "%s: evaluating\n", __func__);
-
-    id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBuffer];
-    id<MTLComputeCommandEncoder> encoder = nil;
-
-    size_t offs_src0;
-    size_t offs_src1;
-    size_t offs_dst;
-
-    // copy the input data to the GPU
-    {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
-
-        id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0);
-
-        memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
-    }
-
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-
-        switch (gf->nodes[i]->op) {
-            case GGML_OP_ADD:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
-
-                    id<MTLBuffer> id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                    id<MTLBuffer> id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[1], &offs_src1);
-                    id<MTLBuffer> id_dst  = mnist_mtl_get_buffer(ctx, gf->nodes[i],         &offs_dst);
-
-                    [encoder setComputePipelineState:ctx->pipeline_add];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-
-                    const int64_t n = ggml_nelements(gf->nodes[i]);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_UNARY:
-                switch (ggml_get_unary_op(gf->nodes[i])) {
-                    case GGML_UNARY_OP_RELU:
-                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                            id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                            [encoder setComputePipelineState:ctx->pipeline_relu];
-                            [encoder setBuffer:id_src offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(gf->nodes[i]);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    default:
-                        {
-                            fprintf(stderr, "%s: node %3d, op = %8s, unary op %d not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op), (int) ggml_get_unary_op(gf->nodes[i]));
-                            GGML_ASSERT(false);
-                            return -1;
-                        }
-                        break;
-                } break;
-            case GGML_OP_SOFT_MAX:
-                {
-#if 0
-                    // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug
-
-                    if (encoder != nil) {
-                        [encoder endEncoding];
-                        encoder = nil;
-                    }
-
-                    // use MPSMatrixSoftMax
-                    id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
-                    id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                    MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
-
-                    MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc];
-                    MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst  descriptor:desc];
-
-                    MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device];
-
-                    [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst];
-#else
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
-
-                    id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                    id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
-                    [encoder setBuffer:id_src offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst offset:offs_dst  atIndex:1];
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-#endif
-                } break;
-            case GGML_OP_MUL_MAT:
-                {
-                    if (encoder != nil) {
-                        [encoder endEncoding];
-                        encoder = nil;
-                    }
-
-                    // use MPSMatrixMultiplication
-                    id<MTLBuffer> id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                    id<MTLBuffer> id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[1], &offs_src1);
-                    id<MTLBuffer> id_dst  = mnist_mtl_get_buffer(ctx, gf->nodes[i],         &offs_dst);
-
-                    const int64_t ncols0 = gf->nodes[i]->src[0]->ne[0];
-                    const int64_t nrows0 = gf->nodes[i]->src[0]->ne[1];
-
-                    const int64_t ncols1 = gf->nodes[i]->src[1]->ne[0];
-                    const int64_t nrows1 = gf->nodes[i]->src[1]->ne[1];
-
-                    const int64_t ncols2 = gf->nodes[i]->ne[0];
-                    const int64_t nrows2 = gf->nodes[i]->ne[1];
-
-                    GGML_ASSERT(ncols0 == ncols1);
-
-                    MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src[0]->nb[1] dataType:MPSDataTypeFloat32];
-                    MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src[1]->nb[1] dataType:MPSDataTypeFloat32];
-                    MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
-
-                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0];
-                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1];
-                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst  descriptor:desc2];
-
-                    MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device
-                        transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0];
-
-                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
-                } break;
-            default:
-                {
-                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-                    GGML_ASSERT(false);
-                    return -1;
-                }
-        }
-    }
-
-    // extract results from the GPU
-    {
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
-
-        struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
-
-        id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0);
-        id<MTLBuffer> id_dst = ctx->out;
-
-        id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
-        [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)];
-        [encoder_blit endEncoding];
-    }
-
-    [command_buffer commit];
-    [command_buffer waitUntilCompleted];
-
-    {
-        const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-        fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed);
-    }
-
-    // select the most probable digit
-    int result = -1;
-    {
-        const float * probs = ctx->out.contents;
-
-        float prob = probs[0];
-
-        for (int i = 0; i < 10; ++i) {
-            fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]);
-
-            if (probs[i] > prob) {
-                result = i;
-                prob = probs[i];
-            }
-        }
-    }
-
-    return result;
-}

+ 0 - 328
ggml/examples/mnist/main.cpp

@@ -1,328 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams
-struct mnist_hparams {
-    int32_t n_input   = 784;
-    int32_t n_hidden  = 500;
-    int32_t n_classes = 10;
-};
-
-struct mnist_model {
-    mnist_hparams hparams;
-
-    struct ggml_tensor * fc1_weight;
-    struct ggml_tensor * fc1_bias;
-
-    struct ggml_tensor * fc2_weight;
-    struct ggml_tensor * fc2_bias;
-
-    struct ggml_context * ctx;
-};
-
-// load the model's weights from a file
-bool mnist_model_load(const std::string & fname, mnist_model & model) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_input   = hparams.n_input;
-        const int n_hidden  = hparams.n_hidden;
-        const int n_classes = hparams.n_classes;
-
-        ctx_size += n_input * n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 weight
-        ctx_size +=           n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 bias
-
-        ctx_size += n_hidden * n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 weight
-        ctx_size +=            n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 bias
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size + 1024*1024,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // Read FC1 layer 1
-    {
-        // Read dimensions
-        int32_t n_dims;
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-
-        {
-            int32_t ne_weight[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_weight[i]), sizeof(ne_weight[i]));
-            }
-
-            // FC1 dimensions taken from file, eg. 768x500
-            model.hparams.n_input  = ne_weight[0];
-            model.hparams.n_hidden = ne_weight[1];
-
-            model.fc1_weight = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_input, model.hparams.n_hidden);
-            fin.read(reinterpret_cast<char *>(model.fc1_weight->data), ggml_nbytes(model.fc1_weight));
-            ggml_set_name(model.fc1_weight, "fc1_weight");
-        }
-
-        {
-            int32_t ne_bias[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_bias[i]), sizeof(ne_bias[i]));
-            }
-
-            model.fc1_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_hidden);
-            fin.read(reinterpret_cast<char *>(model.fc1_bias->data), ggml_nbytes(model.fc1_bias));
-            ggml_set_name(model.fc1_bias, "fc1_bias");
-
-            // just for testing purposes, set some parameters to non-zero
-            model.fc1_bias->op_params[0] = 0xdeadbeef;
-        }
-    }
-
-    // Read FC2 layer 2
-    {
-        // Read dimensions
-        int32_t n_dims;
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-
-        {
-            int32_t ne_weight[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_weight[i]), sizeof(ne_weight[i]));
-            }
-
-            // FC1 dimensions taken from file, eg. 10x500
-            model.hparams.n_classes = ne_weight[1];
-
-            model.fc2_weight = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_hidden, model.hparams.n_classes);
-            fin.read(reinterpret_cast<char *>(model.fc2_weight->data), ggml_nbytes(model.fc2_weight));
-            ggml_set_name(model.fc2_weight, "fc2_weight");
-        }
-
-        {
-            int32_t ne_bias[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_bias[i]), sizeof(ne_bias[i]));
-            }
-
-            model.fc2_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_classes);
-            fin.read(reinterpret_cast<char *>(model.fc2_bias->data), ggml_nbytes(model.fc2_bias));
-            ggml_set_name(model.fc2_bias, "fc2_bias");
-        }
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the model
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - digit:     784 pixel values
-//
-// returns 0 - 9 prediction
-int mnist_eval(
-        const mnist_model & model,
-        const int n_threads,
-        std::vector<float> digit,
-        const char * fname_cgraph
-        ) {
-
-    const auto & hparams = model.hparams;
-
-    static size_t buf_size = hparams.n_input * sizeof(float) * 4;
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hparams.n_input);
-    memcpy(input->data, digit.data(), ggml_nbytes(input));
-    ggml_set_name(input, "input");
-
-    // fc1 MLP = Ax + b
-    ggml_tensor * fc1 = ggml_add(ctx0, ggml_mul_mat(ctx0, model.fc1_weight, input),                model.fc1_bias);
-    ggml_tensor * fc2 = ggml_add(ctx0, ggml_mul_mat(ctx0, model.fc2_weight, ggml_relu(ctx0, fc1)), model.fc2_bias);
-
-    // soft max
-    ggml_tensor * probs = ggml_soft_max(ctx0, fc2);
-    ggml_set_name(probs, "probs");
-
-    // build / export / run the computation graph
-    ggml_build_forward_expand(&gf, probs);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //ggml_graph_print   (&gf);
-    ggml_graph_dump_dot(&gf, NULL, "mnist.dot");
-
-    if (fname_cgraph) {
-        // export the compute graph for later use
-        // see the "mnist-cpu" example
-        ggml_graph_export(&gf, "mnist.ggml");
-
-        fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph);
-    }
-
-    const float * probs_data = ggml_get_data_f32(probs);
-
-    const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
-
-    ggml_free(ctx0);
-
-    return prediction;
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int wasm_eval(uint8_t * digitPtr) {
-    mnist_model model;
-    if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) {
-        fprintf(stderr, "error loading model\n");
-        return -1;
-    }
-    std::vector<float> digit(digitPtr, digitPtr + 784);
-    int result = mnist_eval(model, 1, digit, nullptr);
-    ggml_free(model.ctx);
-
-    return result;
-}
-
-int wasm_random_digit(char * digitPtr) {
-    auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "failed to open digits file\n");
-        return 0;
-    }
-    srand(time(NULL));
-
-    // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-    fin.seekg(16 + 784 * (rand() % 10000));
-    fin.read(digitPtr, 784);
-
-    return 1;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/ggml-model-f32.bin models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    mnist_model model;
-    std::vector<float> digit;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mnist_model_load(argv[1], model)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "models/ggml-model-f32.bin");
-            return 1;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        fprintf(stdout, "%s: loaded model in %8.2f ms\n", __func__, t_load_us / 1000.0f);
-    }
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col]);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(model, 1, digit, "mnist.ggml");
-
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 101
ggml/examples/mnist/mnist-cnn.py

@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import gguf
-import numpy as np
-from tensorflow import keras
-from tensorflow.keras import layers
-
-def train(model_name):
-    # Model / data parameters
-    num_classes = 10
-    input_shape = (28, 28, 1)
-
-    # Load the data and split it between train and test sets
-    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
-
-    # Scale images to the [0, 1] range
-    x_train = x_train.astype("float32") / 255
-    x_test = x_test.astype("float32") / 255
-    # Make sure images have shape (28, 28, 1)
-    x_train = np.expand_dims(x_train, -1)
-    x_test = np.expand_dims(x_test, -1)
-    print("x_train shape:", x_train.shape)
-    print(x_train.shape[0], "train samples")
-    print(x_test.shape[0], "test samples")
-
-    # convert class vectors to binary class matrices
-    y_train = keras.utils.to_categorical(y_train, num_classes)
-    y_test = keras.utils.to_categorical(y_test, num_classes)
-
-    model = keras.Sequential(
-        [
-            keras.Input(shape=input_shape),
-            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
-            layers.MaxPooling2D(pool_size=(2, 2)),
-            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
-            layers.MaxPooling2D(pool_size=(2, 2)),
-            layers.Flatten(),
-            layers.Dropout(0.5),
-            layers.Dense(num_classes, activation="softmax"),
-        ]
-    )
-
-    model.summary()
-    batch_size = 128
-    epochs = 15
-    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
-    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)
-
-    score = model.evaluate(x_test, y_test, verbose=0)
-    print("Test loss:", score[0])
-    print("Test accuracy:", score[1])
-    model.save(model_name)
-    print("Keras model saved to '" + model_name + "'")
-
-def convert(model_name):
-    model = keras.models.load_model(model_name)
-    gguf_model_name = model_name + ".gguf"
-    gguf_writer = gguf.GGUFWriter(gguf_model_name, "mnist-cnn")
-
-    kernel1 = model.layers[0].weights[0].numpy()
-    kernel1 = np.moveaxis(kernel1, [2,3], [0,1])
-    kernel1 = kernel1.astype(np.float16)
-    gguf_writer.add_tensor("kernel1", kernel1, raw_shape=(32, 1, 3, 3))
-
-    bias1 = model.layers[0].weights[1].numpy()
-    bias1 = np.repeat(bias1, 26*26)
-    gguf_writer.add_tensor("bias1", bias1, raw_shape=(1, 32, 26, 26))
-
-    kernel2 = model.layers[2].weights[0].numpy()
-    kernel2 = np.moveaxis(kernel2, [0,1,2,3], [2,3,1,0])
-    kernel2 = kernel2.astype(np.float16)
-    gguf_writer.add_tensor("kernel2", kernel2, raw_shape=(64, 32, 3, 3))
-
-    bias2 = model.layers[2].weights[1].numpy()
-    bias2 = np.repeat(bias2, 11*11)
-    gguf_writer.add_tensor("bias2", bias2, raw_shape=(1, 64, 11, 11))
-
-    dense_w = model.layers[-1].weights[0].numpy()
-    dense_w = dense_w.transpose()
-    gguf_writer.add_tensor("dense_w", dense_w, raw_shape=(10, 1600))
-
-    dense_b = model.layers[-1].weights[1].numpy()
-    gguf_writer.add_tensor("dense_b", dense_b)
-
-    gguf_writer.write_header_to_file()
-    gguf_writer.write_kv_data_to_file()
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-    print("Model converted and saved to '{}'".format(gguf_model_name))
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3:
-        print("Usage: %s <train|convert> <model_name>".format(sys.argv[0]))
-        sys.exit(1)
-    if sys.argv[1] == 'train':
-        train(sys.argv[2])
-    elif sys.argv[1] == 'convert':
-        convert(sys.argv[2])
-    else:
-        print("Usage: %s <train|convert> <model_name>".format(sys.argv[0]))
-        sys.exit(1)

+ 0 - 1
ggml/examples/mnist/models/mnist/.gitignore

@@ -1 +0,0 @@
-ggml-model-f32.bin

BIN
ggml/examples/mnist/models/mnist/mnist_model.state_dict


BIN
ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte


+ 0 - 13
ggml/examples/mpt/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# mpt
-
-set(TEST_TARGET mpt)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# mpt-quantize
-
-set(TEST_TARGET mpt-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 27
ggml/examples/mpt/README.md

@@ -1,27 +0,0 @@
-# MPT
-
-Ref: https://github.com/mosaicml/llm-foundry#mpt
-
-## Usage
-
-```bash
-# get the repo and build it
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j
-
-# get the model from HuggingFace
-# be sure to have git-lfs installed
-git clone https://huggingface.co/mosaicml/mpt-30b
-
-# convert model to FP16
-python3 ../examples/mpt/convert-h5-to-ggml.py ./mpt-30b 1
-
-# run inference using FP16 precision
-./bin/mpt -m ./mpt-30b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
-
-# quantize the model to 5-bits using Q5_0 quantization
-./bin/mpt-quantize ./mpt-30b/ggml-model-f16.bin ./mpt-30b/ggml-model-q5_0.bin q5_0
-```

+ 0 - 169
ggml/examples/mpt/convert-h5-to-ggml.py

@@ -1,169 +0,0 @@
-import os
-import struct
-import sys
-
-import torch
-from transformers import AutoConfig, AutoTokenizer
-
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-
-    cs = [chr(n) for n in cs]
-
-    return dict(zip(bs, cs))
-
-
-def count_model_parts(dir_model: str) -> int:
-    """Returns the number of model parts in the model directory."""
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print(f"Found {num_parts} model parts in {dir_model}")
-    return num_parts
-
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
-hparams = config.to_dict()
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
-fout.write(struct.pack("i", hparams["d_model"]))
-fout.write(struct.pack("i", hparams["max_seq_len"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("f", hparams["attn_config"]["alibi_bias_max"]))
-fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0))
-fout.write(struct.pack("i", ftype))
-
-vocab_size = hparams["vocab_size"]
-
-encoder = tokenizer.vocab
-# Add added_tokens (special tokens) to the encoder
-encoder.update(tokenizer.get_added_vocab())
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
-
-counter = 0
-# sort by value
-for key in sorted(encoder, key=encoder.get):
-    # workaround for key error when c not found
-    text = ""
-    for c in key:
-        if c not in byte_decoder:
-            text += c
-        else:
-            text += chr(byte_decoder[c])
-    text = bytearray(text, encoding="utf-8")
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-
-# Repeat last token until vocab_size
-while counter < vocab_size:
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-
-if num_parts == 0:
-    part_names = ("pytorch_model.bin",)
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    print(f"\n* Loading part: {part_name}")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name].squeeze()
-        n_dims = len(data.shape)
-
-        # ftype == 0 -> float32, ftype == 1 -> float16
-        # default type is fp32
-        ftype_cur = 0
-        if ftype == 1 and name[-7:] == ".weight" and n_dims > 1:
-            ftype_cur = 1
-        data = data.to(dtype=torch.float16 if ftype_cur == 1 else torch.float32).numpy()
-
-        print(
-            "Processing variable: " + name + " with shape: ",
-            data.shape,
-            "->",
-            data.dtype,
-        )
-
-        # header
-        str = name.encode("utf-8")
-        fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-        for i in range(n_dims):
-            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-        fout.write(str)
-
-        # data
-        data.tofile(fout)
-
-    # release memory
-    del model_part
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 1039
ggml/examples/mpt/main.cpp

@@ -1,1039 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cmath>
-#include <cstddef>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <cinttypes>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// no defaults for now
-struct mpt_hparams {
-    int32_t d_model      = 0;
-    int32_t max_seq_len  = 0;
-    int32_t n_heads      = 0;
-    int32_t n_layers     = 0;
-    int32_t n_vocab      = 0;
-    float alibi_bias_max = 0;
-    float clip_qkv       = 0;
-    int32_t ftype        = 0;
-    int32_t n_ctx        = 0;
-
-};
-
-struct mpt_layer {
-    // pre normalization
-    struct ggml_tensor * norm_1_weight;
-
-    // attention
-    struct ggml_tensor * c_attn_wqkv_weight;
-    struct ggml_tensor * c_attn_out_proj_weight;
-
-    // post normalization
-    struct ggml_tensor * norm_2_weight;
-
-    // ff
-    struct ggml_tensor * ffn_up_proj;
-    struct ggml_tensor * ffn_down_proj;
-};
-
-struct mpt_model {
-    mpt_hparams hparams;
-
-    struct ggml_tensor * wte_weight;    // position embedding
-    struct ggml_tensor * norm_f_weight; // language model head
-
-    std::vector<mpt_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-struct mpt_params {
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-
-    int32_t seed           = -1; // RNG seed
-    int32_t n_predict      = 200; // new tokens to predict
-    int32_t n_batch        = 8; // batch size for prompt processing
-    int32_t n_ctx          = 512;
-
-    std::string model      = ""; // model path
-    std::string prompt     = "";
-    std::string token_test = "";
-
-    bool    perplexity     = false;
-
-    // sampling parameters
-    int32_t top_k          = 0;
-    float   top_p          = 1.0f;
-    float   temp           = 0.8f;
-    int32_t repeat_last_n  = 64;
-    float   repeat_penalty = 1.02f;
-
-};
-
-void mpt_print_usage(int /*argc*/, char ** argv, const mpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        load prompt from a file\n");
-    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
-    fprintf(stderr, "                        test tokenization\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = n_vocab)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.2f)\n", params.top_p);
-    fprintf(stderr, "  --temp N              temperature (default: %.2f)\n", params.temp);
-    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
-}
-
-bool mpt_params_parse(int argc, char ** argv, mpt_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            params.prompt = argv[++i];
-        } else if (arg == "-n" || arg == "--n_predict") {
-            params.n_predict = std::stoi(argv[++i]);
-        } else if (arg == "--top_k") {
-            params.top_k = std::max(1, std::stoi(argv[++i]));
-        } else if (arg == "--top_p") {
-            params.top_p = std::stof(argv[++i]);
-        } else if (arg == "--temp") {
-            params.temp = std::stof(argv[++i]);
-        } else if (arg == "--repeat-last-n") {
-            params.repeat_last_n = std::stof(argv[++i]);
-        } else if (arg == "--repeat-penalty") {
-            params.repeat_penalty = std::stof(argv[++i]);
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
-        } else if (arg == "-c" || arg == "--ctx-size") {
-            params.n_ctx = std::stoi(argv[++i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
-            params.n_batch = std::stoi(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
-            mpt_print_usage(argc, argv, params);
-            exit(0);
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i > argc) {
-                fprintf(stderr, "Invalid file param");
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                break;
-            }
-            params.prompt.clear();
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        } else if (arg == "-tt" || arg == "--token_test") {
-            params.token_test = argv[++i];
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            mpt_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-// load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.d_model,        sizeof(hparams.d_model));
-        fin.read((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
-        fin.read((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
-        fin.read((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
-        fin.read((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fin.read((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
-        fin.read((char *) &hparams.ftype,          sizeof(hparams.ftype));
-
-        hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx);
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: d_model        = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_ctx          = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_heads        = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers       = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab        = %d\n", __func__, hparams.n_vocab);
-        printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
-        printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
-        printf("%s: ftype          = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr          = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            // Convert token from utf-8
-            std::wstring word_multibytes = convert_to_wstring(word);
-            word.resize(word_multibytes.size());
-            for (size_t w = 0; w < word_multibytes.size(); w++) {
-                word[w] = uint8_t(word_multibytes[w]);
-            }
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit
-    // floats or quantized in order to save memory and also to speed up the
-    // computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(),
-                model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    const auto & hparams = model.hparams;
-    const size_t n_ctx = hparams.n_ctx;
-
-    {
-        const size_t n_embd = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-        const size_t n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight
-        ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32);   // norm_f_weight
-
-        ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32));      // ln_1_weight
-        ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight
-        ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype));     // attn_out_proj_weight
-        ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32));      // ln_2_weight
-        ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
-        ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
-
-        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
-        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
-
-        ctx_size += (1 + 6 * n_layer) * 512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-        const size_t n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte_weight    = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
-        model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        // map by name
-        model.tensors["transformer.wte.weight"]    = model.wte_weight;
-        model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
-
-        for (int i = 0; i < (int) n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.norm_1_weight          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,     n_embd);
-            layer.c_attn_wqkv_weight     = ggml_new_tensor_2d(ctx, wtype,             n_embd, 3 * n_embd);
-            layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype,             n_embd,     n_embd);
-            layer.norm_2_weight          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,     n_embd);
-            layer.ffn_up_proj            = ggml_new_tensor_2d(ctx, wtype,             n_embd, 4 * n_embd);
-            layer.ffn_down_proj          = ggml_new_tensor_2d(ctx, wtype,         4 * n_embd,     n_embd);
-
-            // map by name
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"]        = layer.norm_1_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"]     = layer.c_attn_wqkv_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_out_proj_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"]        = layer.norm_2_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"]   = layer.ffn_up_proj;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd  = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-
-        const int64_t n_mem      = n_layer * n_ctx;
-        const int64_t n_elements = n_embd  * n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = {1, 1};
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong shape in model file: got [%5d, "
-                        "%5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1],
-                       ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong size in model file: got %zu, "
-                        "expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
-              const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, bool logits_all, size_t & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.d_model;
-    const int n_layer = hparams.n_layers;
-    const int n_head  = hparams.n_heads;
-    const int n_vocab = hparams.n_vocab;
-    const int n_ctx   = hparams.n_ctx;
-    const float eps   = 1e-5f;
-
-    static size_t buf_size = 256u * 1024 * 1024;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
-    static void * scr0 = malloc(scr0_size);
-
-    static size_t scr1_size = 256u*1024*1024;
-    static void * scr1 = malloc(scr1_size);
-
-    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
-        const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
-        // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
-        // buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-        // a = self.ln_1(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur);
-        }
-
-        // self-attention
-        //  b, _, past_key_value = self.attn(a, past_key_value=past_key_value,
-        //  attn_bias=attn_bias, attention_mask=attention_mask,
-        //  is_causal=is_causal)
-        {
-            // compute QKV
-            cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur);
-
-            if (model.hparams.clip_qkv > 0.0f) {
-                cur = ggml_clamp(ctx0, cur, -model.hparams.clip_qkv, model.hparams.clip_qkv);
-            }
-
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * k =
-                    ggml_view_1d(ctx0, model.memory_k, N * n_embd,
-                                 (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past));
-                struct ggml_tensor * v =
-                    ggml_view_1d(ctx0, model.memory_v, N * n_embd,
-                                 (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0,
-            // 2, 1, 3) [64, N, 12]
-            struct ggml_tensor * Q = ggml_permute(
-                ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2,
-                1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1,
-            // 3) [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_k) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             0, 2, 1, 3);
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head)));
-
-            struct ggml_tensor * KQ_scaled_alibi =
-                ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1,
-            // 2, 0, 3).contiguous() [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans = ggml_cpy(
-                ctx0,
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_v) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); }
-        }
-
-        inpL = ggml_add(ctx0, inpL, cur);
-
-        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-
-        // m = self.ln_2(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur);
-        }
-
-        // n = self.mlp(m)
-        {
-
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // cur = proj_w*cur + proj_b
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur);
-        }
-
-        // x = x + n
-        inpL = ggml_add(ctx0, inpL, cur);
-    }
-
-    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, eps);
-        // inpL = ln_f_g*inpL
-        inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // output embedding weight tied to input embedding
-    inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
-
-    // logits -> probs
-    // inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    // std::cout << "Qcur" << std::endl;
-    // print_tensor(Qcur);
-
-    // if (n_past%100 == 0) {
-    // ggml_graph_print(&gf);
-    // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot");
-    // }
-
-    if (logits_all) {
-        // return result for all tokens
-        embd_w.resize(n_vocab *N);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL) , sizeof(float) * n_vocab * N);
-    } else {
-        // return result for just the last token
-        embd_w.resize(n_vocab);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab);
-    }
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0) / N;
-    }
-    // printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-std::vector<float> softmax(const std::vector<float> & logits) {
-    std::vector<float> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) max_logit = std::max(max_logit, v);
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        const float logit = logits[i] - max_logit;
-        const float exp_logit = expf(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
-    return probs;
-}
-
-int perplexity(const mpt_params & params) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    printf("%s: n_threads = %d\n", __func__, params.n_threads);
-    printf("%s: n_batch   = %d\n", __func__, params.n_batch);
-    printf("%s: n_ctx     = %d\n", __func__, params.n_ctx);
-    printf("\n");
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    mpt_model model;
-
-    model.hparams.n_ctx = params.n_ctx;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<int> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
-
-    int count   = 0;
-
-    const int n_chunk = embd_inp.size() / params.n_ctx;
-
-    const int n_vocab = model.hparams.n_vocab;
-    const int n_batch = params.n_batch;
-
-    double nll = 0.0;
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
-
-    for (int i = 0; i < n_chunk; ++i) {
-
-        const int start =     i * params.n_ctx;
-        const int end   = start + params.n_ctx;
-
-        const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
-
-        std::vector<float> logits;
-
-        const auto t_start = std::chrono::high_resolution_clock::now();
-
-        for (int j = 0; j < num_batches; ++j) {
-
-            const int batch_start = start + j * n_batch;
-            const int batch_size  = std::min(end - batch_start, n_batch);
-
-            std::vector<gpt_vocab::id> embd;
-
-            for(int p=0;p<batch_size;p++) {
-                embd.push_back( embd_inp[batch_start+p]  );
-            }
-
-            std::vector<float> batch_logits;// = llama_get_logits(ctx);
-
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!mpt_eval(model, params.n_threads, j * batch_size, embd, batch_logits, true, mem_per_token)) {
-                printf("%s: failed to evaluate model\n", __func__);
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-
-            logits.insert(logits.end(), batch_logits.data(), batch_logits.data() + batch_size * n_vocab);
-
-        }
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
-        if (i == 0) {
-            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
-            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
-                total_seconds = total_seconds % (60*60);
-            }
-            fprintf(stderr, "%d minutes\n", total_seconds / 60);
-
-            printf("\nChunk\tPPL cumulative\tPPL chunk\n");
-        }
-
-        // We get the logits for all the tokens in the context window (params.n_ctx)
-        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half of the window (so the model always has
-        // some context to predict the token).
-        //
-        // We rely on the fact that attention in the forward pass only looks at previous
-        // tokens here, so the logits returned for each token are an accurate representation
-        // of what the model would have predicted at that point.
-        //
-        // Example, we have a context window of 512, we will compute perplexity for each of the
-        // last 256 tokens.  Then, we split the input up into context window size chunks to
-        // process the entire prompt.
-
-        double nllchunk = 0.0;
-        int countchunk = 0;
-
-        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
-
-            const float prob = softmax(tok_logits)[embd_inp[ start+ j + 1]];
-
-            nllchunk += -std::log(prob);
-            ++countchunk;
-        }
-
-		nll += nllchunk;
-		count += countchunk;
-
-        // perplexity is e^(average negative log-likelihood)
-        printf("%d\t%.8lf\t%.8lf\n", i + 1, std::exp(nll / count), std::exp(nllchunk/countchunk) );
-        fflush(stdout);
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n",   __func__, t_load_us / 1000.0f);
-        printf("%s:     eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / (n_chunk * params.n_ctx));
-        printf("%s:    total time = %8.2f ms\n",   __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    mpt_params params;
-
-    if (mpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.perplexity) {
-        return perplexity(params);
-    }
-
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    if (params.n_predict < 0) {
-        params.n_predict = 0;
-    }
-
-    printf("%s: seed      = %d\n",   __func__, params.seed);
-    printf("%s: n_threads = %d\n",   __func__, params.n_threads);
-    printf("%s: n_batch   = %d\n",   __func__, params.n_batch);
-    printf("%s: n_ctx     = %d\n",   __func__, params.n_ctx);
-    printf("%s: n_predict = %d\n\n", __func__, params.n_predict);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    mpt_model model;
-
-    model.hparams.n_ctx = params.n_ctx;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    if (params.top_k == 0) {
-        params.top_k = model.hparams.n_vocab;
-    }
-
-    if (params.repeat_last_n == -1) {
-        params.repeat_last_n = params.n_ctx;
-    }
-
-    printf("\n");
-    printf("%s: temp           = %.3f\n", __func__, params.temp);
-    printf("%s: top_k          = %d\n",   __func__, params.top_k);
-    printf("%s: top_p          = %.3f\n", __func__, params.top_p);
-    printf("%s: repeat_last_n  = %d\n",   __func__, params.repeat_last_n);
-    printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty);
-
-    int64_t t_sample_us = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<int32_t> last_n_tokens(params.n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    // tokenize the prompt
-    std::vector<int> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    printf("\n");
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d\n", __func__, i, embd_inp[i]);
-    }
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-    std::vector<float> logits;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
-
-    int n_past     = 0;
-    int n_consumed = 0;
-    int n_sampled  = 0;
-
-    while (n_sampled < params.n_predict) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!mpt_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) {
-                printf("%s: failed to predict\n", __func__);
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-
-            n_past += embd.size();
-            embd.clear();
-        }
-
-        if ((int)embd_inp.size() <= n_consumed) {
-            // sample next token
-
-            const int top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp = params.temp;
-            const int repeat_last_n = params.repeat_last_n;
-            const float repeat_penalty = params.repeat_penalty;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - model.hparams.n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(id);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-            ++n_sampled;
-
-        } else {
-            // if here, it means we are still processing the input prompt
-            while ((int) embd_inp.size() > n_consumed) {
-                embd.push_back(embd_inp[n_consumed]);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[n_consumed]);
-
-                ++n_consumed;
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-        }
-
-        // display text
-        for (auto id : embd) {
-           printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 0) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n\n");
-        printf("%s: sampled tokens = %8d\n", __func__, n_sampled);
-        printf("%s:  mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:      load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:    sample time = %8.2f ms / %.2f ms per token\n", __func__, t_sample_us / 1000.0f, t_sample_us / 1000.0f / n_sampled);
-        printf("%s:      eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
-        printf("%s:     total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 186
ggml/examples/mpt/quantize.cpp

@@ -1,186 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <regex>
-#include <string>
-#include <vector>
-
-struct mpt_hparams {
-    int32_t d_model      = 0;
-    int32_t max_seq_len  = 0;
-    int32_t n_heads      = 0;
-    int32_t n_layers     = 0;
-    int32_t n_vocab      = 0;
-    float alibi_bias_max = 0;
-    float clip_qkv       = 0;
-    int32_t ftype        = 0;
-};
-
-// quantize a model
-bool mpt_model_quantize(const std::string & fname_inp,
-                        const std::string & fname_out, ggml_ftype ftype) {
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__,
-                fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n",
-                    __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *)&magic, sizeof(magic));
-    }
-
-    mpt_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.d_model,        sizeof(hparams.d_model));
-        finp.read((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
-        finp.read((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
-        finp.read((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
-        finp.read((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        finp.read((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
-        finp.read((char *) &hparams.ftype,          sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: d_model        = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads        = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers       = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab        = %d\n", __func__, hparams.n_vocab);
-        printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
-        printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.d_model,        sizeof(hparams.d_model));
-        fout.write((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
-        fout.write((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
-        fout.write((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
-        fout.write((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fout.write((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
-        fout.write((char *) &ftype_dst,              sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read((char *)&len, sizeof(len));
-            fout.write((char *)&len, sizeof(len));
-
-            word.resize(len);
-            finp.read((char *)word.data(), len);
-            fout.write((char *)word.data(), len);
-        }
-    }
-
-    printf("%s: quantizing tensors\n", __func__);
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./mpt-quantize models/mpt/ggml-model.bin
-//  models/mpt/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
-                argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = {0, NULL, false};
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n",
-                    __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__,
-               t_quantize_us / 1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__,
-               (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 100
ggml/examples/prompts/dolly-v2.txt

@@ -1,100 +0,0 @@
-Hello World! => 12092,3645,2
-I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476
-The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449
-"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574
-'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464
-"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449
-The book costs $19.99 => 510,1984,4815,370,746,15,1525
-"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449
-Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2
-C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32
-W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2
-H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32
-I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15
-Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32
-The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15
-I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15
-The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15
-She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15
-We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15
-He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15
-They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15
-The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15
-I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15
-We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15
-The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15
-She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15
-He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15
-The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15
-I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15
-The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15
-They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15
-She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15
-We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15
-He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15
-The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15
-I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15
-They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15
-She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15
-He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15
-The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15
-I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15
-They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15
-She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15
-We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15
-The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15
-He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15
-The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15
-I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15
-They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15
-She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15
-We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15
-The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15
-He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15
-The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15
-I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15
-They are renovating their house. => 3726,403,30074,839,616,2419,15
-She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15
-We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15
-The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15
-He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15
-The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15
-I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15
-They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15
-She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15
-We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15
-The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15
-He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15
-The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15
-I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15
-They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15
-She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15
-We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15
-The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15
-He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15
-The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15
-I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15
-They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15
-She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15
-We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15
-The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15
-He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15
-The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15
-I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15
-They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15
-She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15
-We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15
-The traffic is congest => 510,7137,310,25801
-The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15
-I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15
-She plays the piano beautifully. => 2993,7120,253,18542,27839,15
-The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15
-I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15
-He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15
-The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15
-She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15
-The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15
-We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15
-He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15
-The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15
-She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15

+ 0 - 1
ggml/examples/prompts/gpt-2-chinese.txt

@@ -1 +0,0 @@
-请问洗手间在哪里? => 6435,7309,3819,2797,7313,1762,1525,7027,8043

+ 0 - 100
ggml/examples/prompts/gpt-2.txt

@@ -1,100 +0,0 @@
-Hello World! => 15496,2159,0
-I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474
-The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526
-"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496
-'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637
-"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526
-The book costs $19.99 => 464,1492,3484,720,1129,13,2079
-"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526
-Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0
-C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30
-W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0
-H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30
-I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13
-Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30
-The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13
-I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13
-The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13
-She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13
-We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13
-He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13
-They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13
-The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13
-I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13
-We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13
-The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13
-She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13
-He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13
-The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13
-I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13
-The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13
-They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13
-She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13
-We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13
-He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13
-The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13
-I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13
-They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13
-She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13
-He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13
-The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13
-I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13
-They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13
-She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13
-We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13
-The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13
-He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13
-The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13
-I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13
-They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13
-She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13
-We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13
-The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13
-He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13
-The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13
-I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13
-They are renovating their house. => 2990,389,24317,803,511,2156,13
-She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13
-We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13
-The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13
-He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13
-The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13
-I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13
-They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13
-She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13
-We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13
-The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13
-He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13
-The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13
-I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13
-They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13
-She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13
-We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13
-The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13
-He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13
-The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13
-I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13
-They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13
-She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13
-We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13
-The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13
-He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13
-The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13
-I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13
-They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13
-She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13
-We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13
-The traffic is congest => 464,4979,318,22791
-The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13
-I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13
-She plays the piano beautifully. => 3347,5341,262,19132,21104,13
-The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13
-I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13
-He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13
-The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13
-She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13
-The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13
-We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13
-He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13
-The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13
-She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13

+ 0 - 100
ggml/examples/prompts/gpt-j.txt

@@ -1,100 +0,0 @@
-Hello World! => 15496,2159,0
-I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474
-The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526
-"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496
-'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637
-"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526
-The book costs $19.99 => 464,1492,3484,720,1129,13,2079
-"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526
-Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0
-C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30
-W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0
-H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30
-I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13
-Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30
-The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13
-I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13
-The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13
-She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13
-We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13
-He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13
-They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13
-The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13
-I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13
-We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13
-The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13
-She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13
-He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13
-The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13
-I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13
-The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13
-They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13
-She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13
-We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13
-He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13
-The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13
-I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13
-They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13
-She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13
-He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13
-The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13
-I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13
-They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13
-She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13
-We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13
-The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13
-He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13
-The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13
-I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13
-They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13
-She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13
-We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13
-The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13
-He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13
-The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13
-I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13
-They are renovating their house. => 2990,389,24317,803,511,2156,13
-She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13
-We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13
-The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13
-He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13
-The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13
-I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13
-They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13
-She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13
-We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13
-The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13
-He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13
-The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13
-I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13
-They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13
-She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13
-We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13
-The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13
-He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13
-The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13
-I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13
-They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13
-She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13
-We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13
-The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13
-He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13
-The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13
-I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13
-They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13
-She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13
-We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13
-The traffic is congest => 464,4979,318,22791
-The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13
-I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13
-She plays the piano beautifully. => 3347,5341,262,19132,21104,13
-The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13
-I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13
-He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13
-The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13
-She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13
-The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13
-We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13
-He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13
-The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13
-She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13

+ 0 - 1
ggml/examples/prompts/gpt-neox-japanese.txt

@@ -1 +0,0 @@
-明日の天気はどうですか。 => 263,7353,268,18461,271,1722,18405,265

+ 0 - 100
ggml/examples/prompts/gpt-neox.txt

@@ -1,100 +0,0 @@
-Hello World! => 12092,3645,2
-I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476
-The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449
-"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574
-'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464
-"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449
-The book costs $19.99 => 510,1984,4815,370,746,15,1525
-"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449
-Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2
-C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32
-W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2
-H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32
-I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15
-Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32
-The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15
-I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15
-The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15
-She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15
-We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15
-He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15
-They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15
-The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15
-I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15
-We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15
-The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15
-She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15
-He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15
-The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15
-I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15
-The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15
-They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15
-She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15
-We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15
-He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15
-The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15
-I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15
-They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15
-She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15
-He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15
-The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15
-I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15
-They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15
-She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15
-We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15
-The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15
-He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15
-The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15
-I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15
-They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15
-She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15
-We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15
-The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15
-He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15
-The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15
-I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15
-They are renovating their house. => 3726,403,30074,839,616,2419,15
-She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15
-We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15
-The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15
-He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15
-The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15
-I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15
-They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15
-She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15
-We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15
-The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15
-He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15
-The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15
-I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15
-They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15
-She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15
-We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15
-The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15
-He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15
-The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15
-I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15
-They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15
-She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15
-We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15
-The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15
-He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15
-The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15
-I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15
-They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15
-She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15
-We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15
-The traffic is congest => 510,7137,310,25801
-The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15
-I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15
-She plays the piano beautifully. => 2993,7120,253,18542,27839,15
-The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15
-I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15
-He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15
-The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15
-She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15
-The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15
-We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15
-He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15
-The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15
-She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15

+ 0 - 3
ggml/examples/prompts/polyglot-ko.txt

@@ -1,3 +0,0 @@
-이것은 테스트 이다. => 12271,296,6474,28037,17
-걱정할 필요 없다. => 18311,482,1062,550,267,17
-버그는 언젠가 고쳐진다. => 6904,272,8575,10381,1765,17

+ 0 - 100
ggml/examples/prompts/replit.txt

@@ -1,100 +0,0 @@
-Hello World! => 6466,147,2317,350
-I can't believe it's already Friday!" => 286,512,172,185,13392,393,172,155,3239,147,29249,8537
-The URL for the website is https://www.example.com." => 505,5635,250,170,11745,235,147,303,262,552,148,811,148,241,148,161
-"She said, 'I love to travel.'" => 161,10386,4089,150,206,286,8440,194,147,12363,148,172,161
-'The temperature is 25.5°C.' => 172,505,147,9502,235,147,20022,8516,228,148,172
-"Let's meet at 2:30 p.m. in the park." => 161,8997,172,155,17120,536,147,162,5245,147,207,148,204,148,219,170,147,17664,148,161
-The book costs $19.99 => 505,147,2277,17494,236,166,11824
-"John's favorite color is blue." => 161,7475,172,155,147,11105,147,349,235,17046,148,161
-Th@nk y0u f0r y0ur h3lp! => 6309,240,9019,147,237,159,247,147,202,159,223,147,237,159,2458,147,226,171,3899,350
-C@n I g3t a c0ffee, pl3@se? => 228,240,211,398,147,267,171,185,216,147,196,159,13360,163,150,147,1287,171,240,155,163,272
-W0w! Th@t's @m@zing! => 450,159,274,350,147,6309,240,185,172,155,268,204,240,301,248,350
-H0w 4re y0u t0d@y? => 304,159,274,320,440,147,237,159,247,147,185,159,182,240,237,272
-I l0ve t0 tr@vel @r0und the w0rld. => 286,997,159,1290,147,185,159,147,490,240,3893,268,223,159,3981,170,147,274,159,223,2833,148
-Wh@t's y0ur f@v0rite m0vie? => 450,226,240,185,172,155,147,237,159,2458,147,202,240,252,159,5961,163,147,204,159,24373,272
-The cat is sleeping on the mat. => 505,147,1604,235,147,3987,248,347,170,147,1297,148
-I need to buy some groceries for dinner. => 286,1645,194,147,8068,1499,147,10022,1037,10023,250,147,182,2749,148
-The sun is shining brightly in the sky. => 505,147,5852,235,147,7304,2967,147,215,649,391,219,170,147,7310,148
-She is reading a book in the park. => 10386,235,9838,216,147,2277,219,170,147,17664,148
-We went for a walk on the beach yesterday. => 3250,10825,250,216,147,8156,347,170,294,5371,147,28830,148
-He plays the guitar like a pro. => 5301,7084,155,170,147,4604,2214,1425,216,3474,148
-They are going to the movies tonight. => 18815,429,6552,194,170,147,15877,194,7907,148
-The flowers are blooming in the garden. => 505,147,22953,155,429,147,10411,2799,248,219,170,147,22140,148
-I enjoy listening to classical music. => 286,23162,15876,248,194,239,4251,147,7395,148
-We need to buy groceries for the week. => 3250,1645,194,147,8068,147,10022,1037,10023,250,170,9238,148
-The dog is chasing its tail in circles. => 505,147,6540,235,147,196,916,248,1602,147,5129,219,147,4095,155,148
-She is wearing a beautiful red dress. => 10386,235,147,16427,248,216,147,23447,147,1160,147,14592,148
-He is a talented actor in Hollywood. => 5301,235,216,147,29750,246,147,5112,219,147,16924,391,10477,148
-The children are playing in the playground. => 505,7934,429,7084,248,219,170,7084,12055,148
-I'm going to visit my grandparents this weekend. => 286,172,204,6552,194,9939,1247,147,11806,12019,291,9238,314,148
-The coffee tastes bitter without sugar. => 505,147,21526,147,20931,155,5145,1430,1988,147,28759,148
-They are planning a surprise party for her. => 18815,429,147,23661,216,147,29240,147,7344,250,1869,148
-She sings like an angel on stage. => 10386,147,155,6502,1425,426,147,26028,347,12685,148
-We should take a vacation to relax. => 3250,936,4654,216,147,15388,946,194,1998,2744,148
-He is studying medicine at the university. => 5301,235,7959,248,147,20742,1668,536,170,147,8025,148
-The rain is pouring heavily outside. => 505,147,6885,235,5306,248,1189,5451,391,8096,148
-I enjoy watching romantic movies. => 286,23162,147,3355,248,147,26080,4140,147,15877,148
-They are celebrating their anniversary today. => 18815,429,147,30000,5841,1669,147,24734,5464,1770,13386,148
-She dances gracefully to the music. => 10386,147,182,1626,155,147,267,8771,8001,194,170,147,7395,148
-He is an excellent basketball player. => 5301,235,426,147,12300,675,185,147,26646,5132,6294,148
-The baby is sleeping soundly in the crib. => 505,147,23597,235,147,3987,248,12642,391,219,170,147,7696,215,148
-I need to finish my homework before dinner. => 286,1645,194,147,6717,1247,147,1071,2722,2643,147,182,2749,148
-They are organizing a charity event next month. => 18815,429,147,16442,248,216,1054,1511,1663,2399,12821,148
-She is cooking a delicious meal for us. => 10386,235,147,20453,248,216,3936,23455,147,26658,250,147,539,148
-We should go hiking in the mountains. => 3250,936,4242,147,2254,5357,219,170,147,204,18028,155,148
-The car broke down on the way to work. => 505,7553,147,510,10036,4288,347,170,3699,194,1916,148
-He loves playing video games in his free time. => 5301,8440,155,7084,248,8722,147,11281,219,1439,4002,801,148
-The birds are chirping in the trees. => 505,147,13043,155,429,147,3904,223,4639,219,170,5311,155,148
-I want to learn how to play the piano. => 286,1857,194,14167,2496,194,7084,170,147,207,23635,148
-They are building a new shopping mall in the city. => 18815,429,11038,216,277,147,22184,147,204,609,219,170,147,2416,148
-She is writing a novel in her spare time. => 10386,235,3242,216,147,25814,219,1869,6772,2382,801,148
-We are going to the zoo this Saturday. => 3250,429,6552,194,170,147,25101,291,147,31426,148
-The cake looks delicious with chocolate frosting. => 505,147,24422,16303,3936,23455,312,147,5619,533,2239,147,202,3973,3431,148
-He is a talented painter who sells his artwork. => 5301,235,216,147,29750,246,147,9226,279,2888,13004,155,1439,12234,2722,148
-The students are studying for their exams. => 505,15707,429,7959,248,250,1669,147,12398,155,148
-I enjoy swimming in the ocean. => 286,23162,147,4729,8528,248,219,170,147,26193,148
-They are renovating their house. => 18815,429,991,10724,3643,1669,13788,148
-She is practicing yoga to stay healthy. => 10386,235,147,18453,248,147,5063,1186,194,15344,147,28550,148
-We should plant flowers in the garden. => 3250,936,147,9212,147,22953,155,219,170,147,22140,148
-The traffic is heavy during rush hour. => 505,147,11097,235,147,22232,4340,147,22319,147,5686,148
-He is a skilled chef who creates amazing dishes. => 5301,235,216,147,8891,246,9784,202,2888,13720,147,28880,147,23852,383,148
-The baby is crawling on the floor. => 505,147,23597,235,147,22120,248,347,170,147,5895,148
-I need to buy a new pair of shoes. => 286,1645,194,147,8068,216,277,12632,210,147,155,21953,155,148
-They are going on a road trip across the country. => 18815,429,6552,347,216,147,6362,147,11395,9762,170,11305,148
-She is playing the piano beautifully. => 10386,235,7084,248,170,147,207,23635,147,23447,391,148
-We are going to a concert tomorrow night. => 3250,429,6552,194,216,1710,4391,29524,12716,148
-The cake tastes delicious with vanilla frosting. => 505,147,24422,147,20931,155,3936,23455,312,5535,7476,147,202,3973,3431,148
-He is a dedicated teacher who inspires his students. => 5301,235,216,326,8298,3460,147,9675,2888,147,28801,155,1439,15707,148
-The students are participating in a science fair. => 505,15707,429,147,30961,3643,219,216,147,10587,147,7636,148
-I enjoy hiking in the mountains. => 286,23162,147,2254,5357,219,170,147,204,18028,155,148
-They are organizing a beach cleanup next weekend. => 18815,429,147,16442,248,216,294,5371,147,10401,2399,9238,314,148
-She is taking photographs of nature. => 10386,235,147,12345,147,4709,1547,155,210,147,211,8603,148
-We should try a new restaurant in town. => 3250,936,147,746,216,277,147,11007,219,147,10200,148
-The traffic is moving slowly on the highway. => 505,147,11097,235,147,8601,147,9880,391,347,170,5976,3330,148
-He is a talented singer with a beautiful voice. => 5301,235,216,147,29750,246,147,155,248,279,312,216,147,23447,147,9316,148
-The baby is laughing and giggling. => 505,147,23597,235,147,23066,248,221,147,2341,3631,2869,148
-I need to do laundry and wash my clothes. => 286,1645,194,543,960,3981,2154,221,147,27589,1247,147,22141,383,148
-They are planning a trip to Europe. => 18815,429,147,23661,216,147,11395,194,13131,148
-She is learning how to play the guitar. => 10386,235,11754,2496,194,7084,170,147,4604,2214,148
-We are going to a museum this Sunday. => 3250,429,6552,194,216,147,204,433,1177,291,147,29111,148
-The coffee smells amazing in the morning. => 505,147,21526,31454,155,147,28880,219,170,20701,148
-He is a hardworking farmer who grows crops. => 5301,235,216,8524,14992,147,16679,279,2888,147,6044,155,147,8650,155,148
-The students are presenting their research projects. => 505,15707,429,5130,248,1669,13217,14235,148
-I enjoy playing soccer with my friends. => 286,23162,7084,248,147,9351,5318,312,1247,147,5347,155,148
-They are volunteering at a local shelter. => 18815,429,147,5238,7478,163,12798,536,216,2491,2905,1359,279,148
-She is practicing martial arts for self-defense. => 10386,235,147,18453,248,147,3261,185,4381,12234,155,250,623,153,29896,148
-We should try a new recipe for dinner. => 3250,936,147,746,216,277,147,9851,250,147,182,2749,148
-The traffic is congest => 505,147,11097,235,1710,14169
-The sun is shining brightly today. => 505,147,5852,235,147,7304,2967,147,215,649,391,13386,148
-I enjoy reading books in my free time. => 286,23162,9838,147,9670,219,1247,4002,801,148
-She plays the piano beautifully. => 10386,7084,155,170,147,207,23635,147,23447,391,148
-The cat chased the mouse around the room. => 505,147,1604,147,196,916,246,170,12551,6890,170,9654,148
-I love eating pizza with extra cheese. => 286,8440,147,163,3643,147,207,8403,312,8230,9784,383,163,148
-He always wears a hat wherever he goes. => 5301,5418,147,16427,155,216,147,4879,2171,2433,1189,16177,148
-The flowers in the garden are blooming. => 505,147,22953,155,219,170,147,22140,429,147,10411,2799,248,148
-She danced gracefully on the stage. => 10386,13378,12408,147,267,8771,8001,347,170,12685,148
-The dog barked loudly in the park. => 505,147,6540,147,973,293,246,147,30182,391,219,170,147,17664,148
-We went swimming in the ocean yesterday. => 3250,10825,147,4729,8528,248,219,170,147,26193,147,28830,148
-He speaks fluent French and Spanish. => 5301,147,13285,155,147,21677,147,254,17590,221,147,31519,148
-The train arrived at the station on time. => 505,147,872,147,20712,182,536,170,147,7184,347,801,148
-She cooked a delicious meal for her family. => 10386,147,20453,246,216,3936,23455,147,26658,250,1869,147,2002,148

+ 0 - 100
ggml/examples/prompts/starcoder.txt

@@ -1,100 +0,0 @@
-Hello World! => 8279,10896,19
-I can't believe it's already Friday!" => 59,883,1330,13710,561,1182,3425,506,25674,11555
-The URL for the website is https://www.example.com." => 1318,3834,436,322,9575,438,1678,555,1499,32,2763,32,508,3107
-"She said, 'I love to travel.'" => 20,25387,9884,30,330,59,14290,372,25283,29329
-'The temperature is 25.5°C.' => 25,1318,13587,438,225,36,39,32,39,23767,53,4564
-"Let's meet at 2:30 p.m. in the park." => 20,9809,1182,18450,821,225,36,44,37,34,298,32,95,32,328,322,880,93,3107
-The book costs $19.99 => 1318,7618,25950,398,35,43,32,43,43
-"John's favorite color is blue." => 20,19693,1182,27448,1963,438,10087,3107
-Th@nk y0u f0r y0ur h3lp! => 1027,50,19877,533,34,103,296,34,100,533,34,305,420,37,1915,19
-C@n I g3t a c0ffee, pl3@se? => 53,50,96,439,485,37,102,312,281,34,21298,30,1278,37,50,277,49
-W0w! Th@t's @m@zing! => 73,34,105,19,947,50,102,1182,477,95,50,26768,19
-H0w 4re y0u t0d@y? => 58,34,105,225,38,268,533,34,103,273,34,86,50,107,49
-I l0ve t0 tr@vel @r0und the w0rld. => 59,456,34,587,273,34,554,50,1203,477,100,34,642,322,341,34,100,1381,32
-Wh@t's y0ur f@v0rite m0vie? => 2444,50,102,1182,533,34,305,296,50,104,34,1049,345,34,104,1075,49
-The cat is sleeping on the mat. => 1318,10501,438,9368,299,544,322,2491,32
-I need to buy some groceries for dinner. => 59,1849,372,16968,1629,20234,85,6958,436,343,3369,32
-The sun is shining brightly in the sky. => 1318,15323,438,787,19068,38231,631,328,322,26718,32
-She is reading a book in the park. => 25387,438,9175,312,7618,328,322,880,93,32
-We went for a walk on the beach yesterday. => 3122,14236,436,312,13503,544,322,526,867,39485,32
-He plays the guitar like a pro. => 1331,41271,322,3932,19931,2124,312,534,32
-They are going to the movies tonight. => 31805,884,6783,372,322,27889,26076,694,32
-The flowers are blooming in the garden. => 1318,7290,483,884,323,18466,299,328,322,485,22461,32
-I enjoy listening to classical music. => 59,31567,20498,372,443,1578,17522,32
-We need to buy groceries for the week. => 3122,1849,372,16968,20234,85,6958,436,322,8209,32
-The dog is chasing its tail in circles. => 1318,27435,438,663,9949,2819,13203,328,46428,32
-She is wearing a beautiful red dress. => 25387,438,996,6992,312,36493,3346,343,714,32
-He is a talented actor in Hollywood. => 1331,438,312,273,9556,318,16038,328,48228,631,21118,32
-The children are playing in the playground. => 1318,5713,884,19788,328,322,4654,1749,32
-I'm going to visit my grandparents this weekend. => 59,3464,6783,372,7725,1672,33162,19277,458,40618,32
-The coffee tastes bitter without sugar. => 1318,36917,273,633,307,3493,391,2876,309,18628,32
-They are planning a surprise party for her. => 31805,884,26116,312,6178,9251,15270,436,7791,32
-She sings like an angel on stage. => 25387,309,2052,2124,600,600,17691,544,10019,32
-We should take a vacation to relax. => 3122,1395,4818,312,29164,367,372,41972,32
-He is studying medicine at the university. => 1331,438,14866,299,32388,482,821,322,707,9190,32
-The rain is pouring heavily outside. => 1318,36987,438,9202,299,46003,2801,11127,32
-I enjoy watching romantic movies. => 59,31567,37652,26045,7268,27889,32
-They are celebrating their anniversary today. => 31805,884,48278,839,1741,3623,23921,5810,672,11610,32
-She dances gracefully to the music. => 25387,343,3151,31376,4938,372,322,17522,32
-He is an excellent basketball player. => 1331,438,600,39203,48400,11653,4362,32
-The baby is sleeping soundly in the crib. => 1318,323,17156,438,9368,299,9934,631,328,322,281,7972,32
-I need to finish my homework before dinner. => 59,1849,372,11361,1672,6765,1007,2670,343,3369,32
-They are organizing a charity event next month. => 31805,884,10558,6183,312,1351,543,1692,2354,6811,32
-She is cooking a delicious meal for us. => 25387,438,23682,299,312,409,406,2406,597,279,436,1770,32
-We should go hiking in the mountains. => 3122,1395,1983,420,1546,299,328,322,10874,1907,32
-The car broke down on the way to work. => 1318,6346,43289,2835,544,322,3352,372,1389,32
-He loves playing video games in his free time. => 1331,598,4954,19788,6027,19705,328,6697,3741,1133,32
-The birds are chirping in the trees. => 1318,8424,3210,884,663,476,7075,328,322,23453,32
-I want to learn how to play the piano. => 59,2637,372,7350,2624,372,4654,322,298,25757,32
-They are building a new shopping mall in the city. => 31805,884,9038,312,537,40692,345,464,328,322,11297,32
-She is writing a novel in her spare time. => 25387,438,4127,312,32913,328,7791,1869,586,1133,32
-We are going to the zoo this Saturday. => 3122,884,6783,372,322,1288,604,458,358,30288,32
-The cake looks delicious with chocolate frosting. => 1318,281,1062,7780,409,406,2406,623,10408,27589,296,20932,299,32
-He is a talented painter who sells his artwork. => 1331,438,312,273,9556,318,42300,6560,10800,101,6697,5549,1007,32
-The students are studying for their exams. => 1318,16512,884,14866,299,436,3623,538,1462,32
-I enjoy swimming in the ocean. => 59,31567,2535,449,6714,328,322,337,18857,32
-They are renovating their house. => 31805,884,316,15007,1741,3623,17075,32
-She is practicing yoga to stay healthy. => 25387,438,11808,11636,533,40067,372,20005,44538,32
-We should plant flowers in the garden. => 3122,1395,26795,7290,483,328,322,485,22461,32
-The traffic is heavy during rush hour. => 1318,16391,438,32389,5929,540,1372,12021,32
-He is a skilled chef who creates amazing dishes. => 1331,438,312,3001,12088,44051,6560,9585,36986,1214,4279,32
-The baby is crawling on the floor. => 1318,323,17156,438,281,1294,2920,544,322,17648,32
-I need to buy a new pair of shoes. => 59,1849,372,16968,312,537,6092,432,787,37764,32
-They are going on a road trip across the country. => 31805,884,6783,544,312,24122,19337,10160,322,10769,32
-She is playing the piano beautifully. => 25387,438,19788,322,298,25757,526,4846,325,514,107,32
-We are going to a concert tomorrow night. => 3122,884,6783,372,312,457,6989,31841,19212,32
-The cake tastes delicious with vanilla frosting. => 1318,281,1062,273,633,307,409,406,2406,623,44653,296,20932,299,32
-He is a dedicated teacher who inspires his students. => 1331,438,312,23112,30877,6560,26194,8017,6697,16512,32
-The students are participating in a science fair. => 1318,16512,884,24623,1741,328,312,27536,19375,32
-I enjoy hiking in the mountains. => 59,31567,420,1546,299,328,322,10874,1907,32
-They are organizing a beach cleanup next weekend. => 31805,884,10558,6183,312,526,867,13144,2354,40618,32
-She is taking photographs of nature. => 25387,438,15137,15110,23626,432,24406,32
-We should try a new restaurant in town. => 3122,1395,1596,312,537,43719,328,38212,32
-The traffic is moving slowly on the highway. => 1318,16391,438,14089,12899,631,544,322,3857,3073,32
-He is a talented singer with a beautiful voice. => 1331,438,312,273,9556,318,309,10118,623,312,36493,20309,32
-The baby is laughing and giggling. => 1318,323,17156,438,2317,2943,299,461,485,365,36088,32
-I need to do laundry and wash my clothes. => 59,1849,372,745,2317,642,994,461,341,917,1672,7375,46948,32
-They are planning a trip to Europe. => 31805,884,26116,312,19337,372,27268,32
-She is learning how to play the guitar. => 25387,438,9608,2624,372,4654,322,3932,19931,32
-We are going to a museum this Sunday. => 3122,884,6783,372,312,345,539,378,458,358,28036,32
-The coffee smells amazing in the morning. => 1318,36917,309,42153,101,36986,328,322,33768,32
-He is a hardworking farmer who grows crops. => 1331,438,312,6784,13578,9019,2302,6560,485,2138,25170,1069,32
-The students are presenting their research projects. => 1318,16512,884,5024,299,3623,13234,8528,32
-I enjoy playing soccer with my friends. => 59,31567,19788,22682,10035,623,1672,22523,32
-They are volunteering at a local shelter. => 31805,884,3920,45585,8637,821,312,2196,309,2542,391,32
-She is practicing martial arts for self-defense. => 25387,438,11808,11636,345,502,564,5549,101,436,630,31,43694,32
-We should try a new recipe for dinner. => 3122,1395,1596,312,537,15233,436,343,3369,32
-The traffic is congest => 1318,16391,438,457,2776
-The sun is shining brightly today. => 1318,15323,438,787,19068,38231,631,11610,32
-I enjoy reading books in my free time. => 59,31567,9175,21739,328,1672,3741,1133,32
-She plays the piano beautifully. => 25387,41271,322,298,25757,526,4846,325,514,107,32
-The cat chased the mouse around the room. => 1318,10501,663,16109,322,8459,6835,322,8355,32
-I love eating pizza with extra cheese. => 59,14290,484,1741,47630,623,6717,8277,30315,32
-He always wears a hat wherever he goes. => 1331,5182,996,4177,312,25793,2154,424,938,13107,32
-The flowers in the garden are blooming. => 1318,7290,483,328,322,485,22461,884,323,18466,299,32
-She danced gracefully on the stage. => 25387,343,6087,31376,4938,544,322,10019,32
-The dog barked loudly in the park. => 1318,27435,323,1087,318,598,836,631,328,322,880,93,32
-We went swimming in the ocean yesterday. => 3122,14236,2535,449,6714,328,322,337,18857,39485,32
-He speaks fluent French and Spanish. => 1331,24498,101,38055,43652,461,14911,1708,32
-The train arrived at the station on time. => 1318,5683,2099,32114,821,322,18662,544,1133,32
-She cooked a delicious meal for her family. => 25387,23682,318,312,409,406,2406,597,279,436,7791,13872,32

+ 0 - 110
ggml/examples/prompts/test-cases.txt

@@ -1,110 +0,0 @@
-# test case format
-# <language>: <sentence>
-
-English: Hello World!
-English: I can't believe it's already Friday!"
-English: The URL for the website is https://www.example.com."
-English: "She said, 'I love to travel.'"
-English: 'The temperature is 25.5°C.'
-English: "Let's meet at 2:30 p.m. in the park."
-English: The book costs $19.99
-English: "John's favorite color is blue."
-English: Th@nk y0u f0r y0ur h3lp!
-English: C@n I g3t a c0ffee, pl3@se?
-English: W0w! Th@t's @m@zing!
-English: H0w 4re y0u t0d@y?
-English: I l0ve t0 tr@vel @r0und the w0rld.
-English: Wh@t's y0ur f@v0rite m0vie?
-English: The cat is sleeping on the mat.
-English: I need to buy some groceries for dinner.
-English: The sun is shining brightly in the sky.
-English: She is reading a book in the park.
-English: We went for a walk on the beach yesterday.
-English: He plays the guitar like a pro.
-English: They are going to the movies tonight.
-English: The flowers are blooming in the garden.
-English: I enjoy listening to classical music.
-English: We need to buy groceries for the week.
-English: The dog is chasing its tail in circles.
-English: She is wearing a beautiful red dress.
-English: He is a talented actor in Hollywood.
-English: The children are playing in the playground.
-English: I'm going to visit my grandparents this weekend.
-English: The coffee tastes bitter without sugar.
-English: They are planning a surprise party for her.
-English: She sings like an angel on stage.
-English: We should take a vacation to relax.
-English: He is studying medicine at the university.
-English: The rain is pouring heavily outside.
-English: I enjoy watching romantic movies.
-English: They are celebrating their anniversary today.
-English: She dances gracefully to the music.
-English: He is an excellent basketball player.
-English: The baby is sleeping soundly in the crib.
-English: I need to finish my homework before dinner.
-English: They are organizing a charity event next month.
-English: She is cooking a delicious meal for us.
-English: We should go hiking in the mountains.
-English: The car broke down on the way to work.
-English: He loves playing video games in his free time.
-English: The birds are chirping in the trees.
-English: I want to learn how to play the piano.
-English: They are building a new shopping mall in the city.
-English: She is writing a novel in her spare time.
-English: We are going to the zoo this Saturday.
-English: The cake looks delicious with chocolate frosting.
-English: He is a talented painter who sells his artwork.
-English: The students are studying for their exams.
-English: I enjoy swimming in the ocean.
-English: They are renovating their house.
-English: She is practicing yoga to stay healthy.
-English: We should plant flowers in the garden.
-English: The traffic is heavy during rush hour.
-English: He is a skilled chef who creates amazing dishes.
-English: The baby is crawling on the floor.
-English: I need to buy a new pair of shoes.
-English: They are going on a road trip across the country.
-English: She is playing the piano beautifully.
-English: We are going to a concert tomorrow night.
-English: The cake tastes delicious with vanilla frosting.
-English: He is a dedicated teacher who inspires his students.
-English: The students are participating in a science fair.
-English: I enjoy hiking in the mountains.
-English: They are organizing a beach cleanup next weekend.
-English: She is taking photographs of nature.
-English: We should try a new restaurant in town.
-English: The traffic is moving slowly on the highway.
-English: He is a talented singer with a beautiful voice.
-English: The baby is laughing and giggling.
-English: I need to do laundry and wash my clothes.
-English: They are planning a trip to Europe.
-English: She is learning how to play the guitar.
-English: We are going to a museum this Sunday.
-English: The coffee smells amazing in the morning.
-English: He is a hardworking farmer who grows crops.
-English: The students are presenting their research projects.
-English: I enjoy playing soccer with my friends.
-English: They are volunteering at a local shelter.
-English: She is practicing martial arts for self-defense.
-English: We should try a new recipe for dinner.
-English: The traffic is congest
-English: The sun is shining brightly today.
-English: I enjoy reading books in my free time.
-English: She plays the piano beautifully.
-English: The cat chased the mouse around the room.
-English: I love eating pizza with extra cheese.
-English: He always wears a hat wherever he goes.
-English: The flowers in the garden are blooming.
-English: She danced gracefully on the stage.
-English: The dog barked loudly in the park.
-English: We went swimming in the ocean yesterday.
-English: He speaks fluent French and Spanish.
-English: The train arrived at the station on time.
-English: She cooked a delicious meal for her family.
-Korean: 이것은 테스트 이다.
-Korean: 걱정할 필요 없다.
-Korean: 버그는 언젠가 고쳐진다.
-Japanese: 明日の天気はどうですか。
-Chinese: 请问洗手间在哪里?
-Emoji: I'm feeling 😄 today!
-Unicode: ◑ ▢ ▣ ◱

+ 0 - 65
ggml/examples/prompts/tokenize_huggingface.py

@@ -1,65 +0,0 @@
-import os
-from transformers import AutoTokenizer
-
-os.environ['TOKENIZERS_PARALLELISM'] = "false"
-
-list_repo_hf  = ["databricks/dolly-v2-3b",           # dolly-v2 (3b, 7b, 12b models share the same tokenizer)
-                 "gpt2",                             # gpt-2 (gpt2-xl, gpt2-large share the same tokenizer)
-                 "uer/gpt2-chinese-cluecorpussmall", # gpt-2-chinese
-                 "EleutherAI/gpt-j-6b",              # gpt-j
-                 "EleutherAI/gpt-neox-20b",          # gpt-neox
-                 "EleutherAI/polyglot-ko-1.3b",      # gpt-neox (polyglot-ko 5.8b and 12.8b share the same tokenizer")
-                 "rinna/japanese-gpt-neox-3.6b",     # gpt-neox
-                 # mpt-7b (uses gpt-neox-20b tokenizer)
-                 "replit/replit-code-v1-3b",         # replit
-                 "bigcode/starcoder",                # starcoder (huggingface-cli login required)
-                 "openai/whisper-tiny"               # whisper (base, large, large-v2 share the same tokenizer)
-                 ]
-
-repo2ggml     = {"databricks/dolly-v2-3b"           : "dolly-v2",
-                 "gpt2"                             : "gpt-2",
-                 "uer/gpt2-chinese-cluecorpussmall" : "gpt-2-chinese",
-                 "EleutherAI/gpt-j-6b"              : "gpt-j",
-                 "EleutherAI/gpt-neox-20b"          : "gpt-neox",
-                 "EleutherAI/polyglot-ko-1.3b"      : "polyglot-ko",
-                 "rinna/japanese-gpt-neox-3.6b"     : "gpt-neox-japanese",
-                 "replit/replit-code-v1-3b"         : "replit",
-                 "bigcode/starcoder"                : "starcoder",
-                 "openai/whisper-tiny"              : "whisper"}
-
-repo2language = {"databricks/dolly-v2-3b"           : "english",
-                 "gpt2"                             : "english",
-                 "uer/gpt2-chinese-cluecorpussmall" : "chinese",
-                 "EleutherAI/gpt-j-6b"              : "english",
-                 "EleutherAI/gpt-neox-20b"          : "english",
-                 "EleutherAI/polyglot-ko-1.3b"      : "korean",
-                 "rinna/japanese-gpt-neox-3.6b"     : "japanese",
-                 "replit/replit-code-v1-3b"         : "english",
-                 "bigcode/starcoder"                : "english",
-                 "openai/whisper-tiny"              : "english"}
-
-delimeter = ": "
-test_sentences = []
-with open("test-cases.txt", "r") as f:
-    lines = [l.rstrip() for l in f.readlines()]
-    for l in lines:
-        if delimeter in l:
-            language = l[:l.index(delimeter)]
-            sentence = l[l.index(delimeter) + len(delimeter):]
-            test_sentences.append((language.lower(), sentence))
-
-for repo in list_repo_hf:
-
-    target_language = repo2language[repo]
-
-    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
-
-    tokens_hf = []
-    for language, sentence in test_sentences:
-        if language == target_language:
-            tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
-            tokens_hf.append((sentence, tokens))
-
-    save_txt = repo2ggml[repo] + ".txt"
-    with open(save_txt, "w") as f:
-        f.writelines([sentence + " => " + ",".join(str(t) for t in tokens) + "\n" for sentence, tokens in tokens_hf])

+ 0 - 100
ggml/examples/prompts/whisper.txt

@@ -1,100 +0,0 @@
-Hello World! => 15947,3937,0
-I can't believe it's already Friday!" => 40,393,380,1697,309,311,1217,6984,2963
-The URL for the website is https://www.example.com." => 2278,12905,337,220,3322,3144,307,34426,21492,17919,13,3121,335,781,13,1112,889
-"She said, 'I love to travel.'" => 1,9526,848,11,922,40,959,220,1353,220,17227,779,28763
-'The temperature is 25.5°C.' => 6,2278,220,18275,610,1503,307,3552,13,20,11782,34,4443
-"Let's meet at 2:30 p.m. in the park." => 1,8373,311,1677,412,568,25,3446,280,13,76,13,294,220,3322,3884,889
-The book costs $19.99 => 2278,1446,5497,1848,3405,13,8494
-"John's favorite color is blue." => 1,16938,311,2954,2017,307,3344,889
-Th@nk y0u f0r y0ur h3lp! => 2434,31,77,74,288,15,84,283,15,81,288,15,374,276,18,75,79,0
-C@n I g3t a c0ffee, pl3@se? => 34,31,77,286,290,18,83,257,269,15,4617,11,499,18,31,405,30
-W0w! Th@t's @m@zing! => 54,15,86,0,334,31,83,311,10428,76,31,8781,0
-H0w 4re y0u t0d@y? => 39,15,86,1017,265,288,15,84,220,83,15,67,31,88,30
-I l0ve t0 tr@vel @r0und the w0rld. => 40,287,15,303,220,83,15,220,6903,31,779,10428,81,15,997,220,3322,261,15,81,348,13
-Wh@t's y0ur f@v0rite m0vie? => 2471,31,83,311,288,15,374,283,31,85,15,35002,275,15,12702,30
-The cat is sleeping on the mat. => 2278,3857,307,8296,322,220,3322,3803,13
-I need to buy some groceries for dinner. => 40,643,220,1353,2256,512,31391,337,6148,13
-The sun is shining brightly in the sky. => 2278,3295,307,18269,47418,294,220,3322,5443,13
-She is reading a book in the park. => 9526,307,3760,257,1446,294,220,3322,3884,13
-We went for a walk on the beach yesterday. => 4360,1437,337,257,1792,322,220,3322,7534,5186,13
-He plays the guitar like a pro. => 5205,5749,220,3322,7531,411,257,447,13
-They are going to the movies tonight. => 8829,366,516,220,1353,220,3322,6233,220,1756,397,13
-The flowers are blooming in the garden. => 2278,8085,366,45294,294,220,3322,7431,13
-I enjoy listening to classical music. => 40,2103,4764,220,1353,13735,1318,13
-We need to buy groceries for the week. => 4360,643,220,1353,2256,31391,337,220,3322,1243,13
-The dog is chasing its tail in circles. => 2278,3000,307,17876,1080,220,14430,294,13040,13
-She is wearing a beautiful red dress. => 9526,307,4769,257,2238,2182,5231,13
-He is a talented actor in Hollywood. => 5205,307,257,220,32831,6003,8747,294,11628,13
-The children are playing in the playground. => 2278,2227,366,2433,294,220,3322,24646,13
-I'm going to visit my grandparents this weekend. => 40,478,516,220,1353,3441,452,21876,220,11176,6711,13
-The coffee tastes bitter without sugar. => 2278,4982,220,83,40246,13871,1553,5076,13
-They are planning a surprise party for her. => 8829,366,5038,257,6365,3595,337,720,13
-She sings like an angel on stage. => 9526,23250,411,364,14250,322,3233,13
-We should take a vacation to relax. => 4360,820,220,27612,257,12830,220,1353,5789,13
-He is studying medicine at the university. => 5205,307,7601,7195,412,220,3322,5454,13
-The rain is pouring heavily outside. => 2278,4830,307,20450,10950,2380,13
-I enjoy watching romantic movies. => 40,2103,1976,13590,6233,13
-They are celebrating their anniversary today. => 8829,366,15252,220,3322,347,12962,220,83,378,320,13
-She dances gracefully to the music. => 9526,28322,10042,2277,220,1353,220,3322,1318,13
-He is an excellent basketball player. => 5205,307,364,7103,11767,4256,13
-The baby is sleeping soundly in the crib. => 2278,3186,307,8296,1626,356,294,220,3322,47163,13
-I need to finish my homework before dinner. => 40,643,220,1353,2413,452,14578,949,6148,13
-They are organizing a charity event next month. => 8829,366,17608,257,16863,2280,958,1618,13
-She is cooking a delicious meal for us. => 9526,307,6361,257,4809,6791,337,505,13
-We should go hiking in the mountains. => 4360,820,352,23784,294,220,3322,10233,13
-The car broke down on the way to work. => 2278,1032,6902,760,322,220,3322,636,220,1353,589,13
-He loves playing video games in his free time. => 5205,6752,2433,960,2813,294,702,1737,220,3766,13
-The birds are chirping in the trees. => 2278,9009,366,36682,294,220,3322,220,3599,279,13
-I want to learn how to play the piano. => 40,528,220,1353,1466,577,220,1353,862,220,3322,9211,13
-They are building a new shopping mall in the city. => 8829,366,2390,257,777,8688,16026,294,220,3322,2307,13
-She is writing a novel in her spare time. => 9526,307,3579,257,7613,294,720,13798,220,3766,13
-We are going to the zoo this Saturday. => 4360,366,516,220,1353,220,3322,25347,220,11176,8803,13
-The cake looks delicious with chocolate frosting. => 2278,5908,1542,4809,365,6215,37048,13
-He is a talented painter who sells his artwork. => 5205,307,257,220,32831,6003,26619,567,20897,702,15829,13
-The students are studying for their exams. => 2278,1731,366,7601,337,220,3322,347,20514,13
-I enjoy swimming in the ocean. => 40,2103,11989,294,220,3322,7810,13
-They are renovating their house. => 8829,366,18845,990,220,3322,347,1782,13
-She is practicing yoga to stay healthy. => 9526,307,11350,15128,220,1353,1754,4627,13
-We should plant flowers in the garden. => 4360,820,3709,8085,294,220,3322,7431,13
-The traffic is heavy during rush hour. => 2278,220,17227,3341,307,4676,1830,9300,1773,13
-He is a skilled chef who creates amazing dishes. => 5205,307,257,19690,10530,567,7829,2243,10814,13
-The baby is crawling on the floor. => 2278,3186,307,32979,322,220,3322,4123,13
-I need to buy a new pair of shoes. => 40,643,220,1353,2256,257,777,6119,295,6654,13
-They are going on a road trip across the country. => 8829,366,516,322,257,3060,220,83,8400,2108,220,3322,1941,13
-She is playing the piano beautifully. => 9526,307,2433,220,3322,9211,16525,13
-We are going to a concert tomorrow night. => 4360,366,516,220,1353,257,8543,220,83,298,3162,1818,13
-The cake tastes delicious with vanilla frosting. => 2278,5908,220,83,40246,4809,365,17528,37048,13
-He is a dedicated teacher who inspires his students. => 5205,307,257,8374,220,975,4062,567,32566,702,1731,13
-The students are participating in a science fair. => 2278,1731,366,13950,294,257,3497,3143,13
-I enjoy hiking in the mountains. => 40,2103,23784,294,220,3322,10233,13
-They are organizing a beach cleanup next weekend. => 8829,366,17608,257,7534,40991,958,6711,13
-She is taking photographs of nature. => 9526,307,220,48625,17649,295,3687,13
-We should try a new restaurant in town. => 4360,820,220,83,627,257,777,6383,294,220,30401,13
-The traffic is moving slowly on the highway. => 2278,220,17227,3341,307,2684,5692,322,220,3322,17205,13
-He is a talented singer with a beautiful voice. => 5205,307,257,220,32831,6003,11564,365,257,2238,3177,13
-The baby is laughing and giggling. => 2278,3186,307,5059,293,290,24542,13
-I need to do laundry and wash my clothes. => 40,643,220,1353,360,19811,293,5675,452,5534,13
-They are planning a trip to Europe. => 8829,366,5038,257,220,83,8400,220,1353,3315,13
-She is learning how to play the guitar. => 9526,307,2539,577,220,1353,862,220,3322,7531,13
-We are going to a museum this Sunday. => 4360,366,516,220,1353,257,8441,220,11176,7776,13
-The coffee smells amazing in the morning. => 2278,4982,10036,2243,294,220,3322,2446,13
-He is a hardworking farmer who grows crops. => 5205,307,257,1152,22475,17891,567,13156,16829,13
-The students are presenting their research projects. => 2278,1731,366,15578,220,3322,347,2132,4455,13
-I enjoy playing soccer with my friends. => 40,2103,2433,15469,365,452,1855,13
-They are volunteering at a local shelter. => 8829,366,33237,412,257,2654,13341,13
-She is practicing martial arts for self-defense. => 9526,307,11350,20755,8609,337,2698,12,49268,13
-We should try a new recipe for dinner. => 4360,820,220,83,627,257,777,6782,337,6148,13
-The traffic is congest => 2278,220,17227,3341,307,31871
-The sun is shining brightly today. => 2278,3295,307,18269,47418,220,83,378,320,13
-I enjoy reading books in my free time. => 40,2103,3760,3642,294,452,1737,220,3766,13
-She plays the piano beautifully. => 9526,5749,220,3322,9211,16525,13
-The cat chased the mouse around the room. => 2278,3857,33091,220,3322,9719,926,220,3322,1808,13
-I love eating pizza with extra cheese. => 40,959,3936,8298,365,2857,5399,13
-He always wears a hat wherever he goes. => 5205,1009,20877,257,2385,8660,415,1709,13
-The flowers in the garden are blooming. => 2278,8085,294,220,3322,7431,366,45294,13
-She danced gracefully on the stage. => 9526,32909,10042,2277,322,220,3322,3233,13
-The dog barked loudly in the park. => 2278,3000,16202,292,22958,294,220,3322,3884,13
-We went swimming in the ocean yesterday. => 4360,1437,11989,294,220,3322,7810,5186,13
-He speaks fluent French and Spanish. => 5205,10789,40799,5522,293,8058,13
-The train arrived at the station on time. => 2278,220,83,7146,6678,412,220,3322,5214,322,220,3766,13
-She cooked a delicious meal for her family. => 9526,9267,257,4809,6791,337,720,1605,13

+ 0 - 13
ggml/examples/replit/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# replit
-
-set(TEST_TARGET replit)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# replit-quantize
-
-set(TEST_TARGET replit-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 117
ggml/examples/replit/convert-h5-to-ggml.py

@@ -1,117 +0,0 @@
-from pathlib import Path
-import sys
-import struct
-import json
-import numpy as np
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import sentencepiece.sentencepiece_model_pb2 as model
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-sp_proto = model.ModelProto()
-sp_proto.ParseFromString(open(Path(sys.argv[1]) / "spiece.model", "rb").read())
-
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    dir_model, low_cpu_mem_usage=True, trust_remote_code=True
-)
-# print (model)
-
-# print(tokenizer.encode('I believe the meaning of life is'))
-
-list_vars = model.state_dict()
-for name in list_vars.keys():
-    print(name, list_vars[name].shape, list_vars[name].dtype)
-
-fout = open(fname_out, "wb")
-
-print(hparams)
-
-fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
-fout.write(struct.pack("i", hparams["d_model"]))
-fout.write(struct.pack("i", hparams["max_seq_len"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", ftype))
-
-
-# TODO: temporary hack to not deal with implementing the tokenizer
-for piece in sp_proto.pieces:
-    encoded_piece = piece.piece.encode("utf-8")
-    fout.write(struct.pack("i", len(encoded_piece)))
-    fout.write(encoded_piece)
-    fout.write(struct.pack("f", piece.score))
-
-if hparams["vocab_size"] > len(sp_proto.pieces):
-    for i in range(hparams["vocab_size"] - len(sp_proto.pieces)):
-        fout.write(struct.pack("i", 0))
-        fout.write(struct.pack("f", 0))
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    n_dims = len(data.shape)
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode("utf-8")
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 795
ggml/examples/replit/main.cpp

@@ -1,795 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cassert>
-#include <cmath>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <stdint.h>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#if defined(_WIN32)
-#define NOMINMAX
-#include <Windows.h>
-bool is_stdin_terminal() {
-    auto in = GetStdHandle(STD_INPUT_HANDLE);
-    return GetFileType(in) == FILE_TYPE_CHAR;
-}
-#else
-#include <unistd.h>
-bool is_stdin_terminal() {
-    return isatty(STDIN_FILENO);
-}
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-using piece_t = std::pair<std::size_t, float>;
-using piece_map_t = std::unordered_map<std::string, piece_t>;
-
-struct replit_tokenizer {
-    gpt_vocab raw_vocab;
-    piece_map_t piece_map;
-    std::vector<std::string> vocab;
-};
-
-std::pair<std::vector<std::size_t>, float> encode_word(const std::string & word, const piece_map_t & model) {
-    std::vector<int> best_segmentations_starts(word.length() + 1, -1);
-    best_segmentations_starts[0] = 0;
-
-    std::vector<float> best_segmentations_scores(word.length() + 1, -std::numeric_limits<float>::infinity());
-    best_segmentations_scores[0] = 1.0;
-
-    for (size_t start_idx = 0; start_idx < word.length(); ++start_idx) {
-        float best_score_at_start = best_segmentations_scores[start_idx];
-        for (size_t end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) {
-            std::string token = word.substr(start_idx, end_idx - start_idx);
-            if (model.count(token) && best_score_at_start != -std::numeric_limits<float>::infinity()) {
-                float token_score = model.at(token).second;
-                float score = token_score + best_score_at_start;
-                if (best_segmentations_scores[end_idx] == -std::numeric_limits<float>::infinity() ||
-                    best_segmentations_scores[end_idx] > score) {
-                    best_segmentations_starts[end_idx] = start_idx;
-                    best_segmentations_scores[end_idx] = score;
-                }
-            }
-        }
-    }
-
-    if (best_segmentations_scores.back() == -std::numeric_limits<float>::infinity()) {
-        return std::make_pair(std::vector<std::size_t>{0}, 0.0f);
-    }
-
-    float score = best_segmentations_scores.back();
-    int start = best_segmentations_starts.back();
-    int end = word.length();
-    std::vector<std::size_t> tokens;
-    while (start != 0) {
-        const auto token_id = model.at(word.substr(start, end - start)).first;
-        tokens.insert(tokens.begin(), token_id);
-        int next_start = best_segmentations_starts[start];
-        end = start;
-        start = next_start;
-    }
-    const auto token_id = model.at(word.substr(start, end - start)).first;
-    tokens.insert(tokens.begin(), token_id);
-    return std::make_pair(tokens, score);
-}
-
-bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int max_vocab_size) {
-    std::string word;
-    std::vector<char> buf(128);
-
-    for (int i = 0; i < max_vocab_size; i++) {
-        uint32_t len;
-        fin.read((char *)&len, sizeof(len));
-
-        buf.resize(len);
-        fin.read((char *)buf.data(), len);
-        word.assign(buf.data(), len);
-
-        float score;
-        fin.read((char *)&score, sizeof(score));
-
-        tokenizer.piece_map[word] = std::make_pair(i, -score);
-        tokenizer.raw_vocab.id_to_token[i] = word;
-    }
-
-    return true;
-}
-
-std::string replace_all(const std::string & str,    // where to work
-                        const std::string & find,   // substitute 'find'
-                        const std::string & replace //      by 'replace'
-) {
-    using namespace std;
-    string result;
-    size_t find_len = find.size();
-    size_t pos, from = 0;
-    while (string::npos != (pos = str.find(find, from))) {
-        result.append(str, from, pos - from);
-        result.append(replace);
-        from = pos + find_len;
-    }
-    result.append(str, from, string::npos);
-    return result;
-}
-
-std::string ws_symbol = "\342\226\201";
-std::vector<std::size_t> replit_tokenizer_tokenize(replit_tokenizer & tokenizer, const std::string & text) {
-    std::vector<std::size_t> tokens;
-    auto normalized_text = replace_all(text, " ", ws_symbol);
-    auto tokenized = encode_word(normalized_text, tokenizer.piece_map);
-
-    return tokenized.first;
-}
-
-std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std::vector<std::size_t> & tokens) {
-    std::string text;
-    for (auto token : tokens) {
-        text += tokenizer.raw_vocab.id_to_token[token];
-    }
-    auto denormalized_text = replace_all(text, ws_symbol, " ");
-    return denormalized_text;
-}
-
-// no defaults for now
-struct replit_hparams {
-    int32_t d_model = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
-};
-
-struct replit_layer {
-    // pre normalization
-    struct ggml_tensor * norm_1_weight;
-
-    // attention
-    struct ggml_tensor * c_attn_wqkv_weight;
-    struct ggml_tensor * c_attn_out_proj_weight;
-
-    // post normalization
-    struct ggml_tensor * norm_2_weight;
-
-    // ff
-    struct ggml_tensor * ffn_up_proj;
-    struct ggml_tensor * ffn_down_proj;
-};
-
-struct replit_model {
-    replit_hparams hparams;
-
-    struct ggml_tensor * wte_weight;    // position embedding
-    struct ggml_tensor * norm_f_weight; // language model head
-
-    std::vector<replit_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool replit_model_load(const std::string & fname, replit_model & model, replit_tokenizer & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: d_model      = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len  = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads      = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers     = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab      = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype        = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr        = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    replit_tokenizer_load(vocab, fin, model.hparams.n_vocab);
-
-    // for the big tensors, we have the option to store the data in 16-bit
-    // floats or quantized in order to save memory and also to speed up the
-    // computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(),
-                model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd = hparams.d_model;
-        const int n_layer = hparams.n_layers;
-        const int n_ctx = hparams.max_seq_len;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight
-        ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32);   // ln_f_weight
-
-        ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32));      // ln_1_weight
-        ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight
-        ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype));     // attn_out_proj_weight
-        ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32));      // ln_2_weight
-        ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
-        ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
-
-        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
-        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
-
-        ctx_size += (1 + 6 * n_layer) * 512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-        const size_t n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
-        model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        // map by name
-        model.tensors["transformer.wte.weight"] = model.wte_weight;
-        model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
-
-        for (int i = 0; i < (int)n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd);
-            layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd);
-            layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd);
-
-            // map by name
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
-                layer.c_attn_out_proj_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd = hparams.d_model;
-        const int n_layer = hparams.n_layers;
-        const int n_ctx = hparams.max_seq_len;
-
-        const int64_t n_mem = n_layer * n_ctx;
-        const int64_t n_elements = n_embd * n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRIu64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = {1, 1};
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong shape in model file: got [%5d, "
-                        "%5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1],
-                       ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong size in model file: got %zu, "
-                        "expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool replit_eval(const replit_model & model, const int n_threads, const int n_past,
-                 const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, bool logits_all,
-                 size_t & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd = hparams.d_model;
-    const int n_layer = hparams.n_layers;
-    const int n_head = hparams.n_heads;
-    const int n_vocab = hparams.n_vocab;
-    const int n_ctx = hparams.max_seq_len;
-    const float eps = 1e-5f;
-
-    static size_t buf_size = 256u * 1024 * 1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
-        const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
-        // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
-        // buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-
-        struct ggml_tensor * cur;
-
-        // a = self.ln_1(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur);
-        }
-
-        // self-attention
-        //  b, _, past_key_value = self.attn(a, past_key_value=past_key_value,
-        //  attn_bias=attn_bias, attention_mask=attention_mask,
-        //  is_causal=is_causal)
-        {
-            // compute QKV
-            cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur);
-
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * k =
-                    ggml_view_1d(ctx0, model.memory_k, N * n_embd,
-                                 (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past));
-                struct ggml_tensor * v =
-                    ggml_view_1d(ctx0, model.memory_v, N * n_embd,
-                                 (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0,
-            // 2, 1, 3) [64, N, 12]
-            struct ggml_tensor * Q = ggml_permute(
-                ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2,
-                1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1,
-            // 3) [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_k) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             0, 2, 1, 3);
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head)));
-
-            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1,
-            // 2, 0, 3).contiguous() [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans = ggml_cpy(
-                ctx0,
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_v) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); }
-        }
-
-        inpL = ggml_add(ctx0, inpL, cur);
-
-        // m = self.ln_2(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur);
-        }
-
-        // n = self.mlp(m)
-        {
-
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // cur = proj_w*cur + proj_b
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur);
-        }
-
-        // x = x + n
-        inpL = ggml_add(ctx0, inpL, cur);
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, eps);
-        // inpL = ln_f_g*inpL
-        inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
-    }
-
-    // output embedding weight tied to input embedding
-    inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
-
-    // logits -> probs
-    // inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    // std::cout << "Qcur" << std::endl;
-    // print_tensor(Qcur);
-
-    // if (n_past%100 == 0) {
-    // ggml_graph_print(&gf);
-    // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot");
-    // }
-
-    if (logits_all) {
-        // return result for all tokens
-        embd_w.resize(n_vocab * N);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL), sizeof(float) * n_vocab * N);
-    } else {
-        // return result for just the last token
-        embd_w.resize(n_vocab);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab);
-    }
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0) / N;
-    }
-    // printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        if (!is_stdin_terminal()) {
-            std::string line;
-            while (std::getline(std::cin, line)) {
-                params.prompt = params.prompt + "\n" + line;
-            }
-        } else {
-            params.prompt = gpt_random_prompt(rng);
-        }
-    }
-
-    int64_t t_load_us = 0;
-
-    replit_tokenizer vocab;
-    replit_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!replit_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<std::size_t> embd_inp = replit_tokenizer_tokenize(vocab, params.prompt);
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6zu\n", __func__, i, embd_inp[i]);
-        // vocab.id_to_token.at(embd_inp[i]).c_str()
-    }
-    printf("\n");
-
-    params.n_predict = std::min(params.n_predict, model.hparams.max_seq_len - (int)embd_inp.size());
-
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!replit_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab.raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p,
-                                            temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", replit_tokenizer_detokenize(vocab, {static_cast<std::size_t>(id)}).c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 0) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f,
-               t_predict_us / 1000.0f / n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 182
ggml/examples/replit/quantize.cpp

@@ -1,182 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <regex>
-#include <string>
-#include <vector>
-
-struct mpt_hparams {
-    int32_t d_model     = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads     = 0;
-    int32_t n_layers    = 0;
-    int32_t n_vocab     = 0;
-    int32_t ftype       = 0;
-};
-
-// quantize a model
-bool mpt_model_quantize(const std::string & fname_inp,
-                        const std::string & fname_out, ggml_ftype ftype) {
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__,
-                fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n",
-                    __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *)&magic, sizeof(magic));
-    }
-
-    mpt_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.d_model,     sizeof(hparams.d_model));
-        finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        finp.read((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
-        finp.read((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
-        finp.read((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.ftype,       sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: d_model     = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.d_model,     sizeof(hparams.d_model));
-        fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fout.write((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
-        fout.write((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
-        fout.write((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
-        fout.write((char *) &ftype_dst,           sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read((char *)&len, sizeof(len));
-            fout.write((char *)&len, sizeof(len));
-
-            word.resize(len);
-            finp.read((char *)word.data(), len);
-            fout.write((char *)word.data(), len);
-
-            float prob;
-            finp.read((char *)&prob, sizeof(prob));
-            fout.write((char *)&prob, sizeof(prob));
-        }
-    }
-
-    printf("%s: quantizing tensors\n", __func__);
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./replit-quantize models/replit/ggml-model.bin
-//  models/replit/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
-                argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = {0, NULL, false};
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n",
-                    __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__,
-               t_quantize_us / 1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__,
-               (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 13
ggml/examples/sam/CMakeLists.txt

@@ -1,13 +0,0 @@
-#
-# sam
-
-set(TEST_TARGET sam)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
-
-#
-# sam-quantize
-
-#set(TEST_TARGET sam-quantize)
-#add_executable(${TEST_TARGET} quantize.cpp)
-#target_link_libraries(${TEST_TARGET} PRIVATE ggml common)

+ 0 - 105
ggml/examples/sam/README.md

@@ -1,105 +0,0 @@
-# SAM.cpp
-
-Inference of Meta's [Segment Anything Model](https://github.com/facebookresearch/segment-anything/) in pure C/C++
-
-## Description
-
-The example currently supports only the [ViT-B SAM model checkpoint](https://huggingface.co/facebook/sam-vit-base).
-
-## Next steps
-
-- [X] Reduce memory usage by utilizing the new ggml-alloc
-- [X] Remove redundant graph nodes
-- [ ] Make inference faster
-- [X] Fix the difference in output masks compared to the PyTorch implementation
-- [X] Filter masks based on stability score
-- [ ] Add support for user input
-- [ ] Support F16 for heavy F32 ops
-- [ ] Test quantization
-- [X] Support bigger model checkpoints
-- [ ] GPU support
-
-## Quick start
-```bash
-git clone https://github.com/ggerganov/ggml
-cd ggml
-
-# Install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# Convert PTH model to ggml
-python convert-pth-to-ggml.py examples/sam/sam_vit_b_01ec64.pth . 1
-
-# Build ggml + examples
-mkdir build && cd build
-cmake .. && make -j4
-
-# run inference
-./bin/sam -t 16 -i ../img.jpg -m examples/sam/ggml-model-f16.bin
-```
-
-## Downloading and converting the model checkpoints
-
-You can download a [model checkpoint](https://github.com/facebookresearch/segment-anything/tree/main#model-checkpoints) and convert it to `ggml` format using the script `convert-pth-to-ggml.py`:
-
-```
-# Convert PTH model to ggml
-python convert-pth-to-ggml.py examples/sam/sam_vit_b_01ec64.pth . 1
-```
-
-## Example output on M2 Ultra
-```
- $ ▶ make -j sam && time ./bin/sam -t 8 -i img.jpg
-[ 28%] Built target common
-[ 71%] Built target ggml
-[100%] Built target sam
-main: seed = 1693224265
-main: loaded image 'img.jpg' (680 x 453)
-sam_image_preprocess: scale = 0.664062
-main: preprocessed image (1024 x 1024)
-sam_model_load: loading model from 'models/sam-vit-b/ggml-model-f16.bin' - please wait ...
-sam_model_load: n_enc_state      = 768
-sam_model_load: n_enc_layer      = 12
-sam_model_load: n_enc_head       = 12
-sam_model_load: n_enc_out_chans  = 256
-sam_model_load: n_pt_embd        = 4
-sam_model_load: ftype            = 1
-sam_model_load: qntvr            = 0
-operator(): ggml ctx size = 202.32 MB
-sam_model_load: ...................................... done
-sam_model_load: model size =   185.05 MB / num tensors = 304
-embd_img
-dims: 64 64 256 1 f32
-First & Last 10 elements:
--0.05117 -0.06408 -0.07154 -0.06991 -0.07212 -0.07690 -0.07508 -0.07281 -0.07383 -0.06779
-0.01589 0.01775 0.02250 0.01675 0.01766 0.01661 0.01811 0.02051 0.02103 0.03382
-sum:  12736.272313
-
-Skipping mask 0 with iou 0.705935 below threshold 0.880000
-Skipping mask 1 with iou 0.762136 below threshold 0.880000
-Mask 2: iou = 0.947081, stability_score = 0.955437, bbox (371, 436), (144, 168)
-
-
-main:     load time =    51.28 ms
-main:    total time =  2047.49 ms
-
-real	0m2.068s
-user	0m16.343s
-sys	0m0.214s
-```
-
-Input point is (414.375, 162.796875) (currently hardcoded)
-
-Input image:
-
-![llamas](https://user-images.githubusercontent.com/8558655/261301565-37b7bf4b-bf91-40cf-8ec1-1532316e1612.jpg)
-
-Output mask (mask_out_2.png in build folder):
-
-![mask_glasses](https://user-images.githubusercontent.com/8558655/263706800-47eeea30-1457-4c87-938b-8f11536c5aa7.png)
-
-## References
-
-- [ggml](https://github.com/ggerganov/ggml)
-- [SAM](https://segment-anything.com/)
-- [SAM demo](https://segment-anything.com/demo)

+ 0 - 147
ggml/examples/sam/convert-pth-to-ggml.py

@@ -1,147 +0,0 @@
-# Convert a SAM model checkpoint to a ggml compatible file
-#
-
-import sys
-import torch
-import struct
-import numpy as np
-
-if len(sys.argv) < 3:
-    print("Usage: convert-pth-to-ggml.py file-model dir-output [ftype]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-fname_model = sys.argv[1]
-dir_out     = sys.argv[2]
-fname_out   = dir_out + "/ggml-model.bin"
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 3:
-    ftype = int(sys.argv[3])
-
-if ftype < 0 or ftype > 1:
-    print("Invalid ftype: " + str(ftype))
-    sys.exit(1)
-
-fname_out = fname_out.replace(".bin", "-" + ftype_str[ftype] + ".bin")
-
-# Default params are set to sam_vit_b checkpoint
-n_enc_state = 768
-n_enc_layers = 12
-n_enc_heads = 12
-n_enc_out_chans = 256
-n_pt_embd = 4
-
-model = torch.load(fname_model, map_location="cpu")
-for k, v in model.items():
-    print(k, v.shape)
-    if k == "image_encoder.blocks.0.norm1.weight":
-        n_enc_state = v.shape[0]
-
-if n_enc_state == 1024: # sam_vit_l
-    n_enc_layers = 24
-    n_enc_heads  = 16
-elif n_enc_state == 1280: # sam_vit_h
-    n_enc_layers = 32
-    n_enc_heads  = 16
-
-hparams = {
-    "n_enc_state":      n_enc_state,
-    "n_enc_layers":     n_enc_layers,
-    "n_enc_heads":      n_enc_heads,
-    "n_enc_out_chans":  n_enc_out_chans,
-    "n_pt_embd":        n_pt_embd,
-}
-
-print(hparams)
-
-for k, v in model.items():
-    print(k, v.shape)
-
-#exit()
-#code.interact(local=locals())
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_enc_state"]))
-fout.write(struct.pack("i", hparams["n_enc_layers"]))
-fout.write(struct.pack("i", hparams["n_enc_heads"]))
-fout.write(struct.pack("i", hparams["n_enc_out_chans"]))
-fout.write(struct.pack("i", hparams["n_pt_embd"]))
-fout.write(struct.pack("i", ftype))
-
-for k, v in model.items():
-    name = k
-    shape = v.shape
-
-    if name[:19] == "prompt_encoder.mask":
-        continue
-
-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-    #data = tf.train.load_variable(dir_model, name).squeeze()
-    #data = v.numpy().squeeze()
-    data = v.numpy()
-    n_dims = len(data.shape)
-
-    # for efficiency - transpose some matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    #if name[-14:] == "/attn/c_attn/w" or \
-    #   name[-14:] == "/attn/c_proj/w" or \
-    #   name[-11:] == "/mlp/c_fc/w" or \
-    #   name[-13:] == "/mlp/c_proj/w":
-    #    print("  Transposing")
-    #    data = data.transpose()
-
-    dshape = data.shape
-
-    # default type is fp16
-    ftype_cur = 1
-    if ftype == 0 or n_dims == 1 or \
-            name == "image_encoder.pos_embed" or \
-            name.startswith("prompt_encoder") or \
-            name.startswith("mask_decoder.iou_token") or \
-            name.startswith("mask_decoder.mask_tokens"):
-        print("  Converting to float32")
-        data = data.astype(np.float32)
-        ftype_cur = 0
-    else:
-        print("  Converting to float16")
-        data = data.astype(np.float16)
-
-    # reshape the 1D bias into a 4D tensor so we can use ggml_repeat
-    # keep it in F32 since the data is small
-    if name == "image_encoder.patch_embed.proj.bias":
-        data = data.reshape(1, data.shape[0], 1, 1)
-        n_dims = len(data.shape)
-        dshape = data.shape
-
-    print("  New shape: ", dshape)
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 2201
ggml/examples/sam/main.cpp

@@ -1,2201 +0,0 @@
-#define _USE_MATH_DEFINES // for M_PI
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "stb_image_write.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstddef>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <thread>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (ViT-B SAM)
-struct sam_hparams {
-    int32_t n_enc_state               = 768;
-    int32_t n_enc_layer               = 12;
-    int32_t n_enc_head                = 12;
-    int32_t n_enc_out_chans           = 256;
-    int32_t n_pt_embd                 = 4;
-    int32_t n_dec_heads               = 8;
-    int32_t ftype                     = 1;
-    float   mask_threshold            = 0.f;
-    float   iou_threshold             = 0.88f;
-    float   stability_score_threshold = 0.95f;
-    float   stability_score_offset    = 1.0f;
-    float   eps                       = 1e-6f;
-    float   eps_decoder_transformer   = 1e-5f;
-
-    int32_t n_enc_head_dim() const { return n_enc_state / n_enc_head; }
-    int32_t n_img_size()     const { return 1024; }
-    int32_t n_window_size()  const { return 14; }
-    int32_t n_patch_size()   const { return 16; }
-    int32_t n_img_embd()     const { return n_img_size() / n_patch_size(); }
-
-    std::vector<int32_t> global_attn_indices() const {
-        switch (n_enc_state) {
-            case  768: return {  2,  5,  8, 11 };
-            case 1024: return {  5, 11, 17, 23 };
-            case 1280: return {  7, 15, 23, 31 };
-            default:
-                {
-                    fprintf(stderr, "%s: unsupported n_enc_state = %d\n", __func__, n_enc_state);
-                } break;
-        };
-
-        return {};
-    }
-
-    bool is_global_attn(int32_t layer) const {
-        const auto indices = global_attn_indices();
-
-        for (const auto & idx : indices) {
-            if (layer == idx) {
-                return true;
-            }
-        }
-
-        return false;
-    }
-};
-
-struct sam_layer_enc {
-    struct ggml_tensor * norm1_w;
-    struct ggml_tensor * norm1_b;
-
-    struct ggml_tensor * rel_pos_w;
-    struct ggml_tensor * rel_pos_h;
-
-    struct ggml_tensor * qkv_w;
-    struct ggml_tensor * qkv_b;
-
-    struct ggml_tensor * proj_w;
-    struct ggml_tensor * proj_b;
-
-    struct ggml_tensor * norm2_w;
-    struct ggml_tensor * norm2_b;
-
-    struct ggml_tensor * mlp_lin1_w;
-    struct ggml_tensor * mlp_lin1_b;
-
-    struct ggml_tensor * mlp_lin2_w;
-    struct ggml_tensor * mlp_lin2_b;
-};
-
-struct sam_encoder_image {
-    struct ggml_tensor * pe;
-
-    struct ggml_tensor * proj_w;
-    struct ggml_tensor * proj_b;
-
-    struct ggml_tensor * neck_conv_0;
-    struct ggml_tensor * neck_norm_0_w;
-    struct ggml_tensor * neck_norm_0_b;
-    struct ggml_tensor * neck_conv_1;
-    struct ggml_tensor * neck_norm_1_w;
-    struct ggml_tensor * neck_norm_1_b;
-
-    std::vector<sam_layer_enc> layers;
-};
-
-struct sam_encoder_prompt {
-    struct ggml_tensor * pe;
-
-    struct ggml_tensor * not_a_pt_embd_w;
-    std::vector<struct ggml_tensor *> pt_embd;
-
-    struct ggml_tensor * no_mask_embd_w;
-    //std::vector<struct ggml_tensor *> mask_down_w;
-    //std::vector<struct ggml_tensor *> mask_down_b;
-};
-
-struct  sam_layer_dec_transformer_attn {
-    // q_proj
-    struct ggml_tensor * q_w;
-    struct ggml_tensor * q_b;
-
-    // k_proj
-    struct ggml_tensor * k_w;
-    struct ggml_tensor * k_b;
-
-    // v_proj
-    struct ggml_tensor * v_w;
-    struct ggml_tensor * v_b;
-
-    // out_proj
-    struct ggml_tensor * out_w;
-    struct ggml_tensor * out_b;
-};
-
-struct sam_layer_dec_transformer {
-    sam_layer_dec_transformer_attn self_attn;
-
-    // norm1
-    struct ggml_tensor * norm1_w;
-    struct ggml_tensor * norm1_b;
-
-    sam_layer_dec_transformer_attn cross_attn_token_to_img;
-
-    // norm2
-    struct ggml_tensor * norm2_w;
-    struct ggml_tensor * norm2_b;
-
-    // mlp.lin1
-    struct ggml_tensor * mlp_lin1_w;
-    struct ggml_tensor * mlp_lin1_b;
-
-    // mlp.lin2
-    struct ggml_tensor * mlp_lin2_w;
-    struct ggml_tensor * mlp_lin2_b;
-
-    // norm3
-    struct ggml_tensor * norm3_w;
-    struct ggml_tensor * norm3_b;
-
-    // norm4
-    struct ggml_tensor * norm4_w;
-    struct ggml_tensor * norm4_b;
-
-    sam_layer_dec_transformer_attn cross_attn_img_to_token;
-};
-
-struct sam_layer_dec_output_hypernet_mlps {
-    // mlps_*.layers.0
-    struct ggml_tensor * w_0;
-    struct ggml_tensor * b_0;
-
-    // mlps_*.layers.1
-    struct ggml_tensor * w_1;
-    struct ggml_tensor * b_1;
-
-    // mlps_*.layers.2
-    struct ggml_tensor * w_2;
-    struct ggml_tensor * b_2;
-};
-
-struct sam_decoder_mask {
-    std::vector<sam_layer_dec_transformer> transformer_layers;
-
-    // trasnformer.final_attn_token_to_image
-    sam_layer_dec_transformer_attn transformer_final_attn_token_to_img;
-
-    // transformer.norm_final
-    struct ggml_tensor * transformer_norm_final_w;
-    struct ggml_tensor * transformer_norm_final_b;
-
-    // output_upscaling.0
-    struct ggml_tensor * output_upscaling_0_w;
-    struct ggml_tensor * output_upscaling_0_b;
-
-    // output_upscaling.1
-    struct ggml_tensor * output_upscaling_1_w;
-    struct ggml_tensor * output_upscaling_1_b;
-
-    // output_upscaling.3
-    struct ggml_tensor * output_upscaling_3_w;
-    struct ggml_tensor * output_upscaling_3_b;
-
-    // output_hypernetworks_mlps
-    std::vector<sam_layer_dec_output_hypernet_mlps> output_hypernet_mlps;
-
-    // iou_prediction_head.0
-    struct ggml_tensor * iou_prediction_head_0_w;
-    struct ggml_tensor * iou_prediction_head_0_b;
-
-    // iou_prediction_head.1
-    struct ggml_tensor * iou_prediction_head_1_w;
-    struct ggml_tensor * iou_prediction_head_1_b;
-
-    // iou_prediction_head.2
-    struct ggml_tensor * iou_prediction_head_2_w;
-    struct ggml_tensor * iou_prediction_head_2_b;
-
-    // iou_token.weight
-    struct ggml_tensor * iou_token_w;
-
-    // mask_tokens.weight
-    struct ggml_tensor * mask_tokens_w;
-};
-
-
-struct sam_state {
-    struct ggml_tensor * embd_img;
-
-    struct ggml_tensor * low_res_masks;
-    struct ggml_tensor * iou_predictions;
-
-    //struct ggml_tensor * tmp_save = {};
-
-    struct ggml_context * ctx;
-
-    // buffer for `ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-    // buffers to evaluate the model
-    std::vector<uint8_t> buf_alloc_img_enc;
-    std::vector<uint8_t> buf_compute_img_enc;
-
-    std::vector<uint8_t> buf_alloc_fast;
-    std::vector<uint8_t> buf_compute_fast;
-
-    struct ggml_allocr  * allocr = {};
-};
-
-// void save_tensor(sam_state& state, struct ggml_tensor * t, struct ggml_cgraph * gf) {
-//     if (!state.tmp_save) {
-//         state.tmp_save = ggml_new_tensor(state.ctx, t->type, t->n_dims, t->ne);
-//     }
-//     struct ggml_tensor * tmp0 = ggml_cpy(state.ctx, t, state.tmp_save);
-//     ggml_build_forward_expand(gf, tmp0);
-// }
-
-struct sam_model {
-    sam_hparams hparams;
-
-    sam_encoder_image  enc_img;
-    sam_encoder_prompt enc_prompt;
-    sam_decoder_mask   dec;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-struct sam_point {
-    float x;
-    float y;
-};
-
-// RGB uint8 image
-struct sam_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> data;
-};
-
-// RGB float32 image
-// Memory layout: RGBRGBRGB...
-struct sam_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> data;
-};
-
-void print_t_f32(const char* title, struct ggml_tensor * t, int n = 10) {
-    printf("%s\n", title);
-    float * data = (float *)t->data;
-    printf("dims: %jd %jd %jd %jd f32\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]);
-    printf("First & Last %d elements:\n", n);
-    for (int i = 0; i < std::min((int) (t->ne[0]*t->ne[1]), n); i++) {
-        printf("%.5f ", data[i]);
-        if (i != 0 && i % t->ne[0] == 0) {
-            printf("\n");
-        }
-    }
-    printf("\n");
-    for (int i = 0; i < std::min((int) (t->ne[0]*t->ne[1]), n); i++) {
-        printf("%.5f ", data[ggml_nelements(t) - n + i]);
-        if ((ggml_nelements(t) - n + i) % t->ne[0] == 0) {
-            printf("\n");
-        }
-    }
-    printf("\n");
-    double sum = 0.0;
-    for (int i = 0; i < ggml_nelements(t); i++) {
-        sum += data[i];
-    }
-    printf("sum:  %f\n\n", sum);
-}
-
-static void ggml_disconnect_node_from_graph(ggml_tensor * t) {
-    t->op = GGML_OP_NONE;
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        t->src[i] = NULL;
-    }
-}
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static void ggml_sam_sin(struct ggml_tensor * dst , const struct ggml_tensor * src, int ith, int nth, void * userdata) {
-    GGML_ASSERT(userdata == NULL);
-    GGML_ASSERT(ggml_are_same_shape(dst, src));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src));
-
-    const float * src_data = ggml_get_data_f32(src);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    const int ne = (int)ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = std::min(ie0 + dr, ne);
-
-    for (int i = ie0; i < ie1; ++i) {
-        dst_data[i] = sinf(src_data[i]);
-    }
-}
-
-static void ggml_sam_cos(struct ggml_tensor * dst , const struct ggml_tensor * src, int ith, int nth, void * userdata) {
-    GGML_ASSERT(userdata == NULL);
-    GGML_ASSERT(ggml_are_same_shape(dst, src));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src));
-
-    const float * src_data = ggml_get_data_f32(src);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    const int ne = (int)ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = std::min(ie0 + dr, ne);
-
-    for (int i = ie0; i < ie1; ++i) {
-        dst_data[i] = cosf(src_data[i]);
-    }
-}
-
-bool sam_image_load_from_file(const std::string & fname, sam_image_u8 & img) {
-    int nx, ny, nc;
-    auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3);
-    if (!data) {
-        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    img.nx = nx;
-    img.ny = ny;
-    img.data.resize(nx * ny * 3);
-    memcpy(img.data.data(), data, nx * ny * 3);
-
-    stbi_image_free(data);
-
-    return true;
-}
-
-// ref: https://github.com/facebookresearch/segment-anything/blob/efeab7296ab579d4a261e554eca80faf6b33924a/segment_anything/modeling/sam.py#L164
-// resize largest dimension to 1024
-// normalize: x = (x - mean) / std
-//     mean = [123.675, 116.28, 103.53]
-//     std  = [58.395, 57.12, 57.375]
-//     TODO: why are these hardcoded !?
-// pad to 1024x1024
-// TODO: for some reason, this is not numerically identical to pytorch's interpolation
-bool sam_image_preprocess(const sam_image_u8 & img, sam_image_f32 & res) {
-    const int nx = img.nx;
-    const int ny = img.ny;
-
-    const int nx2 = 1024;
-    const int ny2 = 1024;
-
-    res.nx = nx2;
-    res.ny = ny2;
-    res.data.resize(3*nx2*ny2);
-
-    const float scale = std::max(nx, ny) / 1024.0f;
-
-    fprintf(stderr, "%s: scale = %f\n", __func__, scale);
-
-    const int nx3 = int(nx/scale + 0.5f);
-    const int ny3 = int(ny/scale + 0.5f);
-
-    const float m3[3] = { 123.675f, 116.280f, 103.530f };
-    const float s3[3] = {  58.395f,  57.120f,  57.375f };
-
-    for (int y = 0; y < ny3; y++) {
-        for (int x = 0; x < nx3; x++) {
-            for (int c = 0; c < 3; c++) {
-                // linear interpolation
-                const float sx = (x + 0.5f)*scale - 0.5f;
-                const float sy = (y + 0.5f)*scale - 0.5f;
-
-                const int x0 = std::max(0, (int) std::floor(sx));
-                const int y0 = std::max(0, (int) std::floor(sy));
-
-                const int x1 = std::min(x0 + 1, nx - 1);
-                const int y1 = std::min(y0 + 1, ny - 1);
-
-                const float dx = sx - x0;
-                const float dy = sy - y0;
-
-                const int j00 = 3*(y0*nx + x0) + c;
-                const int j01 = 3*(y0*nx + x1) + c;
-                const int j10 = 3*(y1*nx + x0) + c;
-                const int j11 = 3*(y1*nx + x1) + c;
-
-                const float v00 = img.data[j00];
-                const float v01 = img.data[j01];
-                const float v10 = img.data[j10];
-                const float v11 = img.data[j11];
-
-                const float v0 = v00*(1.0f - dx) + v01*dx;
-                const float v1 = v10*(1.0f - dx) + v11*dx;
-
-                const float v = v0*(1.0f - dy) + v1*dy;
-
-                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
-
-                const int i = 3*(y*nx3 + x) + c;
-
-                res.data[i] = (float(v2) - m3[c]) / s3[c];
-            }
-        }
-    }
-
-    return true;
-}
-
-// load the model's weights from a file
-bool sam_model_load(const std::string & fname, sam_model & model) {
-    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_enc_state,     sizeof(hparams.n_enc_state));
-        fin.read((char *) &hparams.n_enc_layer,     sizeof(hparams.n_enc_layer));
-        fin.read((char *) &hparams.n_enc_head,      sizeof(hparams.n_enc_head));
-        fin.read((char *) &hparams.n_enc_out_chans, sizeof(hparams.n_enc_out_chans));
-        fin.read((char *) &hparams.n_pt_embd,       sizeof(hparams.n_pt_embd));
-        fin.read((char *) &hparams.ftype,           sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_enc_state      = %d\n", __func__, hparams.n_enc_state);
-        printf("%s: n_enc_layer      = %d\n", __func__, hparams.n_enc_layer);
-        printf("%s: n_enc_head       = %d\n", __func__, hparams.n_enc_head);
-        printf("%s: n_enc_out_chans  = %d\n", __func__, hparams.n_enc_out_chans);
-        printf("%s: n_pt_embd        = %d\n", __func__, hparams.n_pt_embd);
-        printf("%s: ftype            = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr            = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    const size_t ctx_size = [&]() {
-        size_t ctx_size = 0;
-
-        const auto & hparams = model.hparams;
-
-        const int32_t n_enc_state     = hparams.n_enc_state;
-        const int32_t n_enc_layer     = hparams.n_enc_layer;
-        const int32_t n_enc_head_dim  = hparams.n_enc_head_dim();
-        const int32_t n_enc_out_chans = hparams.n_enc_out_chans;
-        const int32_t n_pt_embd       = hparams.n_pt_embd;
-
-        const int32_t n_enc_layer_local  = hparams.global_attn_indices().size();
-        const int32_t n_enc_layer_global = n_enc_layer - n_enc_layer_local;
-
-        const int32_t n_img_embd    = hparams.n_img_embd();
-        const int32_t n_window_size = hparams.n_window_size();
-        const int32_t n_patch_size  = hparams.n_patch_size();
-
-        // image encoder
-        {
-            ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size +=     n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16);
-
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-        }
-
-        // image encoder layers
-        {
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16);
-
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16);
-
-            ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*3*n_enc_state*            ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*n_enc_state*            ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state*            ggml_type_sizef(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state*            ggml_type_sizef(GGML_TYPE_F32);
-        }
-
-        ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
-
-        // prompt encoder
-        {
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
-
-            ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-        }
-
-        ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead();
-
-        // mask decoder
-        {
-            //transformer
-            {
-                const int tfm_layers_count = 2;
-                const int qkv_count = 3;
-                const int norm_count = 4;
-                const int n_hypernet_mpls_count = 4;
-
-                // self_attn
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*            ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state*                      ggml_type_sizef(GGML_TYPE_F32);
-
-                // all norms
-                ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-
-                // cross_attn_token_to_img
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)*            ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state*                          ggml_type_sizef(GGML_TYPE_F32);
-
-                // mlp
-                ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*8*n_enc_out_chans*                ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*n_enc_out_chans*                  ggml_type_sizef(GGML_TYPE_F32);
-
-                // cross_attn_img_to_token
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)*            ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state*                          ggml_type_sizef(GGML_TYPE_F32);
-
-                // transformer_final_attn_token_to_img
-                ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += qkv_count*(n_enc_state/2)*            ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_enc_state*                          ggml_type_sizef(GGML_TYPE_F32);
-
-                // transformer_norm_final
-                ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32);
-
-                // output_upscaling
-                ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += 3*n_img_embd*                  ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += (n_img_embd/2)*                               ggml_type_sizef(GGML_TYPE_F32);
-
-                // output_hypernetworks_mlps
-                ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*                ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += n_hypernet_mpls_count*(n_img_embd/2)*                ggml_type_sizef(GGML_TYPE_F32);
-
-                // iou_prediction_head
-                ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += 2*n_enc_out_chans*                ggml_type_sizef(GGML_TYPE_F32);
-                ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16);
-                ctx_size += n_pt_embd*                ggml_type_sizef(GGML_TYPE_F32);
-
-                // iou_token_w
-                ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-
-                // mask_tokens_w
-                ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32);
-            }
-        }
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-
-        return ctx_size;
-    }();
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        ctx = ggml_init(params);
-        if (!ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int32_t n_enc_state      = hparams.n_enc_state;
-        const int32_t n_enc_layer      = hparams.n_enc_layer;
-        const int32_t n_enc_head_dim   = hparams.n_enc_head_dim();
-        const int32_t n_enc_out_chans  = hparams.n_enc_out_chans;
-        const int32_t n_pt_embd        = hparams.n_pt_embd;
-
-        const int32_t n_img_embd    = hparams.n_img_embd();
-        const int32_t n_window_size = hparams.n_window_size();
-        const int32_t n_patch_size  = hparams.n_patch_size();
-
-        model.enc_img.layers.resize(n_enc_layer);
-
-        // image encoder
-        {
-            auto & enc = model.enc_img;
-
-            enc.pe = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_enc_state, n_img_embd, n_img_embd, 1);
-
-            enc.proj_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, n_patch_size, n_patch_size,           3, n_enc_state);
-            enc.proj_b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,            1,            1, n_enc_state);
-
-            enc.neck_conv_0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, n_enc_state,     n_enc_out_chans);
-            enc.neck_conv_1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, n_enc_out_chans, n_enc_out_chans);
-
-            enc.neck_norm_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            enc.neck_norm_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            enc.neck_norm_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            enc.neck_norm_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["image_encoder.pos_embed"] = enc.pe;
-
-            model.tensors["image_encoder.patch_embed.proj.weight"] = enc.proj_w;
-            model.tensors["image_encoder.patch_embed.proj.bias"]   = enc.proj_b;
-
-            model.tensors["image_encoder.neck.0.weight"] = enc.neck_conv_0;
-            model.tensors["image_encoder.neck.2.weight"] = enc.neck_conv_1;
-
-            model.tensors["image_encoder.neck.1.weight"] = enc.neck_norm_0_w;
-            model.tensors["image_encoder.neck.1.bias"]   = enc.neck_norm_0_b;
-
-            model.tensors["image_encoder.neck.3.weight"] = enc.neck_norm_1_w;
-            model.tensors["image_encoder.neck.3.bias"]   = enc.neck_norm_1_b;
-
-            for (int i = 0; i < n_enc_layer; ++i) {
-                auto & layer = enc.layers[i];
-
-                layer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-                layer.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-
-                if (hparams.is_global_attn(i)) {
-                    layer.rel_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_img_embd - 1);
-                    layer.rel_pos_h = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_img_embd - 1);
-                } else {
-                    layer.rel_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_window_size - 1);
-                    layer.rel_pos_h = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_window_size - 1);
-                }
-
-                layer.qkv_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,   n_enc_state, 3*n_enc_state);
-                layer.qkv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_enc_state);
-
-                layer.proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,  n_enc_state,   n_enc_state);
-                layer.proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  n_enc_state);
-
-                layer.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-                layer.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-
-                layer.mlp_lin1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,   n_enc_state, 4*n_enc_state);
-                layer.mlp_lin1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_enc_state);
-
-                layer.mlp_lin2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4*n_enc_state,   n_enc_state);
-                layer.mlp_lin2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_enc_state);
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm1.weight"] = layer.norm1_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm1.bias"]   = layer.norm1_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.rel_pos_w"] = layer.rel_pos_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.rel_pos_h"] = layer.rel_pos_h;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.qkv.weight"] = layer.qkv_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.qkv.bias"]   = layer.qkv_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.proj.weight"] = layer.proj_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.proj.bias"]   = layer.proj_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm2.weight"] = layer.norm2_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm2.bias"]   = layer.norm2_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin1.weight"] = layer.mlp_lin1_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin1.bias"]   = layer.mlp_lin1_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin2.weight"] = layer.mlp_lin2_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin2.bias"]   = layer.mlp_lin2_b;
-            }
-        }
-
-        // prompt encoder
-        {
-            auto & enc = model.enc_prompt;
-
-            enc.pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans/2, 2);
-
-            enc.not_a_pt_embd_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            enc.no_mask_embd_w  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["prompt_encoder.pe_layer.positional_encoding_gaussian_matrix"] = enc.pe;
-            model.tensors["prompt_encoder.not_a_point_embed.weight"] = enc.not_a_pt_embd_w;
-            model.tensors["prompt_encoder.no_mask_embed.weight"]     = enc.no_mask_embd_w;
-
-            enc.pt_embd.resize(n_pt_embd);
-            for (int i = 0; i < n_pt_embd; i++) {
-                enc.pt_embd[i] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                model.tensors["prompt_encoder.point_embeddings." + std::to_string(i) + ".weight"] = enc.pt_embd[i];
-            }
-        }
-
-        // mask decoder
-        {
-            auto & dec = model.dec;
-            auto & tfm_layers = dec.transformer_layers;
-
-            const int tfm_layers_count = 2;
-            tfm_layers.resize(tfm_layers_count);
-            for (int i = 0; i < tfm_layers_count; ++i) {
-                auto& l = tfm_layers[i];
-                l.self_attn.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.self_attn.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.self_attn.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.self_attn.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.cross_attn_token_to_img.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans);
-                l.cross_attn_token_to_img.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.mlp_lin1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, 8*n_enc_out_chans);
-                l.mlp_lin1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8*n_enc_out_chans);
-                l.mlp_lin2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 8*n_enc_out_chans, n_enc_out_chans);
-                l.mlp_lin2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm3_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm4_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.cross_attn_img_to_token.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans);
-                l.cross_attn_img_to_token.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                const auto prefix = "mask_decoder.transformer.layers." + std::to_string(i) + ".";
-                model.tensors[prefix + "self_attn.q_proj.weight"] = l.self_attn.q_w;
-                model.tensors[prefix + "self_attn.q_proj.bias"]   = l.self_attn.q_b;
-                model.tensors[prefix + "self_attn.k_proj.weight"] = l.self_attn.k_w;
-                model.tensors[prefix + "self_attn.k_proj.bias"]   = l.self_attn.k_b;
-                model.tensors[prefix + "self_attn.v_proj.weight"] = l.self_attn.v_w;
-                model.tensors[prefix + "self_attn.v_proj.bias"]   = l.self_attn.v_b;
-                model.tensors[prefix + "self_attn.out_proj.weight"] = l.self_attn.out_w;
-                model.tensors[prefix + "self_attn.out_proj.bias"]   = l.self_attn.out_b;
-
-                model.tensors[prefix + "norm1.weight"] = l.norm1_w;
-                model.tensors[prefix + "norm1.bias"]   = l.norm1_b;
-
-                model.tensors[prefix + "cross_attn_token_to_image.q_proj.weight"] = l.cross_attn_token_to_img.q_w;
-                model.tensors[prefix + "cross_attn_token_to_image.q_proj.bias"]   = l.cross_attn_token_to_img.q_b;
-                model.tensors[prefix + "cross_attn_token_to_image.k_proj.weight"] = l.cross_attn_token_to_img.k_w;
-                model.tensors[prefix + "cross_attn_token_to_image.k_proj.bias"]   = l.cross_attn_token_to_img.k_b;
-                model.tensors[prefix + "cross_attn_token_to_image.v_proj.weight"] = l.cross_attn_token_to_img.v_w;
-                model.tensors[prefix + "cross_attn_token_to_image.v_proj.bias"]   = l.cross_attn_token_to_img.v_b;
-                model.tensors[prefix + "cross_attn_token_to_image.out_proj.weight"] = l.cross_attn_token_to_img.out_w;
-                model.tensors[prefix + "cross_attn_token_to_image.out_proj.bias"]   = l.cross_attn_token_to_img.out_b;
-
-                model.tensors[prefix + "norm2.weight"] = l.norm2_w;
-                model.tensors[prefix + "norm2.bias"]   = l.norm2_b;
-
-                model.tensors[prefix + "mlp.lin1.weight"] = l.mlp_lin1_w;
-                model.tensors[prefix + "mlp.lin1.bias"]   = l.mlp_lin1_b;
-                model.tensors[prefix + "mlp.lin2.weight"] = l.mlp_lin2_w;
-                model.tensors[prefix + "mlp.lin2.bias"]   = l.mlp_lin2_b;
-
-                model.tensors[prefix + "norm3.weight"] = l.norm3_w;
-                model.tensors[prefix + "norm3.bias"]   = l.norm3_b;
-                model.tensors[prefix + "norm4.weight"] = l.norm4_w;
-                model.tensors[prefix + "norm4.bias"]   = l.norm4_b;
-
-                model.tensors[prefix + "cross_attn_image_to_token.q_proj.weight"] = l.cross_attn_img_to_token.q_w;
-                model.tensors[prefix + "cross_attn_image_to_token.q_proj.bias"]   = l.cross_attn_img_to_token.q_b;
-                model.tensors[prefix + "cross_attn_image_to_token.k_proj.weight"] = l.cross_attn_img_to_token.k_w;
-                model.tensors[prefix + "cross_attn_image_to_token.k_proj.bias"]   = l.cross_attn_img_to_token.k_b;
-                model.tensors[prefix + "cross_attn_image_to_token.v_proj.weight"] = l.cross_attn_img_to_token.v_w;
-                model.tensors[prefix + "cross_attn_image_to_token.v_proj.bias"]   = l.cross_attn_img_to_token.v_b;
-                model.tensors[prefix + "cross_attn_image_to_token.out_proj.weight"] = l.cross_attn_img_to_token.out_w;
-                model.tensors[prefix + "cross_attn_image_to_token.out_proj.bias"]   = l.cross_attn_img_to_token.out_b;
-            }
-
-            dec.transformer_final_attn_token_to_img.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans);
-            dec.transformer_final_attn_token_to_img.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.q_proj.weight"] = dec.transformer_final_attn_token_to_img.q_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.q_proj.bias"]   = dec.transformer_final_attn_token_to_img.q_b;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.k_proj.weight"] = dec.transformer_final_attn_token_to_img.k_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.k_proj.bias"]   = dec.transformer_final_attn_token_to_img.k_b;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.v_proj.weight"] = dec.transformer_final_attn_token_to_img.v_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.v_proj.bias"]   = dec.transformer_final_attn_token_to_img.v_b;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.out_proj.weight"] = dec.transformer_final_attn_token_to_img.out_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.out_proj.bias"]   = dec.transformer_final_attn_token_to_img.out_b;
-
-            dec.transformer_norm_final_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            dec.transformer_norm_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["mask_decoder.transformer.norm_final_attn.weight"] = dec.transformer_norm_final_w;
-            model.tensors["mask_decoder.transformer.norm_final_attn.bias"]   = dec.transformer_norm_final_b;
-
-            dec.output_upscaling_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, n_img_embd, n_enc_out_chans);
-            dec.output_upscaling_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd);
-            dec.output_upscaling_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd);
-            dec.output_upscaling_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd);
-            dec.output_upscaling_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  2, 2, n_img_embd/2, n_img_embd);
-            dec.output_upscaling_3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd/2);
-
-            model.tensors["mask_decoder.output_upscaling.0.weight"] = dec.output_upscaling_0_w;
-            model.tensors["mask_decoder.output_upscaling.0.bias"]   = dec.output_upscaling_0_b;
-            model.tensors["mask_decoder.output_upscaling.1.weight"] = dec.output_upscaling_1_w;
-            model.tensors["mask_decoder.output_upscaling.1.bias"]   = dec.output_upscaling_1_b;
-            model.tensors["mask_decoder.output_upscaling.3.weight"] = dec.output_upscaling_3_w;
-            model.tensors["mask_decoder.output_upscaling.3.bias"]   = dec.output_upscaling_3_b;
-
-            const int n_hypernet_mpls_count = 4;
-            dec.output_hypernet_mlps.resize(n_hypernet_mpls_count);
-            for (int i = 0; i < n_hypernet_mpls_count; ++i) {
-                auto& mlp = dec.output_hypernet_mlps[i];
-
-                mlp.w_0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                mlp.b_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                mlp.w_1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                mlp.b_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                mlp.w_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_img_embd/2);
-                mlp.b_2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd/2);
-
-                const auto prefix = "mask_decoder.output_hypernetworks_mlps." + std::to_string(i) + ".";
-                model.tensors[prefix + "layers.0.weight"] = mlp.w_0;
-                model.tensors[prefix + "layers.0.bias"]   = mlp.b_0;
-                model.tensors[prefix + "layers.1.weight"] = mlp.w_1;
-                model.tensors[prefix + "layers.1.bias"]   = mlp.b_1;
-                model.tensors[prefix + "layers.2.weight"] = mlp.w_2;
-                model.tensors[prefix + "layers.2.bias"]   = mlp.b_2;
-            }
-
-            dec.iou_prediction_head_0_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-            dec.iou_prediction_head_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            dec.iou_prediction_head_1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-            dec.iou_prediction_head_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            dec.iou_prediction_head_2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_pt_embd);
-            dec.iou_prediction_head_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_pt_embd);
-
-            dec.iou_token_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans, 1);
-            dec.mask_tokens_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans, n_pt_embd);
-
-            model.tensors["mask_decoder.iou_prediction_head.layers.0.weight"] = dec.iou_prediction_head_0_w;
-            model.tensors["mask_decoder.iou_prediction_head.layers.0.bias"]   = dec.iou_prediction_head_0_b;
-            model.tensors["mask_decoder.iou_prediction_head.layers.1.weight"] = dec.iou_prediction_head_1_w;
-            model.tensors["mask_decoder.iou_prediction_head.layers.1.bias"]   = dec.iou_prediction_head_1_b;
-            model.tensors["mask_decoder.iou_prediction_head.layers.2.weight"] = dec.iou_prediction_head_2_w;
-            model.tensors["mask_decoder.iou_prediction_head.layers.2.bias"]   = dec.iou_prediction_head_2_b;
-
-            model.tensors["mask_decoder.iou_token.weight"] = dec.iou_token_w;
-            model.tensors["mask_decoder.mask_tokens.weight"] = dec.mask_tokens_w;
-        }
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        fprintf(stderr, "%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int64_t nelements = 1;
-            int64_t ne[4] = { 1, 1, 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                int32_t ne_cur;
-                fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
-                ne[i] = ne_cur;
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-            //printf("ne0 = %jd, ne1 = %jd, ne2 = %jd, ne3 = %jd\n", ne[0], ne[1], ne[2], ne[3]);
-
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %d, expected %d\n",
-                        __func__, name.data(), (int) nelements, (int) ggml_nelements(tensor));
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]\n",
-                        __func__, name.data(),
-                        (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
-                        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3]);
-                return false;
-            }
-
-            size_t bpe = 0;
-
-            switch (ftype) {
-                case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
-                case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
-                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                default:
-                        {
-                            fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                            return false;
-                        }
-            };
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), (size_t) nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                fprintf(stderr, ".");
-                fflush(stdout);
-            }
-        }
-
-        if (n_tensors != int(model.tensors.size())) {
-            fprintf(stderr, "%s: model file has %d tensors, but %d tensors were expected\n", __func__, n_tensors, (int) model.tensors.size());
-            return false;
-        }
-
-        fprintf(stderr, " done\n");
-
-        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-struct ggml_tensor * sam_fill_dense_pe(
-            const sam_model   & model,
-          struct ggml_context * ctx0,
-          struct ggml_cgraph  * gf,
-                  sam_state   & state) {
-    const auto & hparams = model.hparams;
-    const auto & enc     = model.enc_prompt;
-
-    const int32_t n_img_embd = hparams.n_img_embd();
-    const float n_img_embd_inv = 1.0f / n_img_embd;
-
-    struct ggml_tensor * xy_embed_stacked = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2, n_img_embd, n_img_embd);
-    ggml_allocr_alloc(state.allocr, xy_embed_stacked);
-
-    if (!ggml_allocr_is_measure(state.allocr)) {
-        float * data = (float *) ggml_get_data(xy_embed_stacked);
-        for (int i = 0; i < n_img_embd; ++i) {
-            const int row = 2*i*n_img_embd;
-            const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1;
-            for (int j = 0; j < n_img_embd; ++j) {
-                const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1;
-                data[row + 2*j + 0] = x_val;
-                data[row + 2*j + 1] = y_val;
-            }
-        }
-    }
-
-    struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), xy_embed_stacked);
-
-    cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI)));
-
-    // concat
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192
-    {
-        struct ggml_tensor * t_sin = ggml_map_custom1(ctx0, cur, ggml_sam_sin, GGML_N_TASKS_MAX, NULL);
-        struct ggml_tensor * t_cos = ggml_map_custom1(ctx0, cur, ggml_sam_cos, GGML_N_TASKS_MAX, NULL);
-
-        cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, t_sin->ne[0] + t_cos->ne[0], cur->ne[1], cur->ne[2]);
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_sin, ggml_view_3d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], t_sin->ne[2], cur->nb[1], cur->nb[2], 0)));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_cos, ggml_view_3d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], t_sin->ne[2], cur->nb[1], cur->nb[2], t_sin->nb[1])));
-    }
-
-    struct ggml_tensor * pe_img_dense = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
-    ggml_build_forward_expand(gf, pe_img_dense);
-
-    return pe_img_dense;
-}
-
-struct ggml_tensor* sam_layer_norm_2d(
-                    struct ggml_context * ctx0,
-                    struct ggml_tensor  * layer,
-                    int                   n_channels,
-                    struct ggml_tensor  * w,
-                    struct ggml_tensor  * b,
-                    float                 eps) {
-    // LayerNorm2d
-    // normalize along channel dimmension
-    // TODO: better implementation
-    layer = ggml_permute(ctx0,
-                ggml_norm(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, layer, 1, 2, 0, 3)), eps),
-                2, 0, 1, 3);
-
-    layer = ggml_add(ctx0,
-              ggml_mul(ctx0,
-                  ggml_repeat(ctx0, ggml_reshape_3d(ctx0, w, 1, 1, n_channels), layer),
-                  layer),
-              ggml_repeat(ctx0, ggml_reshape_3d(ctx0, b, 1, 1, n_channels), layer));
-
-    return layer;
-}
-
-struct ggml_cgraph  * sam_encode_image(
-            const sam_model & model,
-                  sam_state & state,
-        const sam_image_f32 & img) {
-
-    const auto & hparams = model.hparams;
-    const auto & enc     = model.enc_img;
-
-    const int32_t n_enc_state     = hparams.n_enc_state;
-    const int32_t n_enc_layer     = hparams.n_enc_layer;
-    const int32_t n_enc_head      = hparams.n_enc_head;
-    const int32_t n_enc_head_dim  = hparams.n_enc_head_dim();
-    const int32_t n_enc_out_chans = hparams.n_enc_out_chans;
-    const int32_t n_img_size    = hparams.n_img_size();
-    const int32_t n_window_size = hparams.n_window_size();
-
-    struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_img_enc.size(),
-        /*.mem_buffer =*/ state.buf_compute_img_enc.data(),
-        /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
-    };
-
-    struct ggml_context * ctx0   = ggml_init(ggml_params);
-    struct ggml_cgraph  * gf     = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_img_size, n_img_size, 3, 1);
-    ggml_allocr_alloc(state.allocr, inp);
-    if (!ggml_allocr_is_measure(state.allocr)) {
-        float * data = (float *) ggml_get_data(inp);
-
-        const int nx = img.nx;
-        const int ny = img.ny;
-        const int n  = nx*ny;
-
-        GGML_ASSERT(nx == n_img_size && ny == n_img_size);
-
-        for (int k = 0; k < 3; k++) {
-            for (int y = 0; y < ny; y++) {
-                for (int x = 0; x < nx; x++) {
-                    data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k];
-                }
-            }
-        }
-    }
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L392
-    struct ggml_tensor * cur = ggml_conv_2d_sk_p0(ctx0, enc.proj_w, inp);
-    cur = ggml_add_inplace(ctx0,
-            cur,
-            ggml_repeat(ctx0, enc.proj_b, cur));
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L394
-    // keep in F32
-    cur = ggml_cont(ctx0,
-            ggml_permute(ctx0, cur, 1, 2, 0, 3));
-
-    // convert to F16
-    //cur = ggml_cpy(ctx0,
-    //        ggml_permute(ctx0, cur, 1, 2, 0, 3),
-    //        ggml_new_tensor_3d(ctx0, GGML_TYPE_F16, n_enc_state, n_img_embd, n_img_embd));
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L108-L109
-    cur = ggml_add_inplace(ctx0, cur, enc.pe);
-
-    struct ggml_tensor * inpL = cur;
-
-    for (int il = 0; il < n_enc_layer; ++il) {
-        const auto & layer = enc.layers[il];
-
-        // norm
-        // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L168
-        {
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_mul(ctx0, cur, layer.norm1_w);
-            cur = ggml_add_inplace(ctx0, cur, layer.norm1_b);
-        }
-
-        const int64_t w0 = cur->ne[1];
-        const int64_t h0 = cur->ne[2];
-
-        if (hparams.is_global_attn(il) == false) {
-            // local attention layer - apply window partition
-            // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L169-L172
-            cur = ggml_win_part(ctx0, cur, n_window_size);
-        }
-
-        const int64_t W = cur->ne[1];
-        const int64_t H = cur->ne[2];
-
-        // self-attention
-        {
-            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.qkv_b);
-
-            // split qkv into separate tensors
-            // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L225-L229
-            const int B = cur->ne[3];
-
-            cur = ggml_reshape_4d(ctx0, cur, n_enc_state, 3, W*H, B);
-            cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 3, 1, 2));
-
-            struct ggml_tensor * Q;
-            struct ggml_tensor * K;
-            struct ggml_tensor * V;
-
-            Q = ggml_view_3d   (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 0*cur->nb[3]);
-            Q = ggml_reshape_4d(ctx0, Q,   n_enc_head_dim, n_enc_head, W*H, B);
-            Q = ggml_cont      (ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-            Q = ggml_reshape_3d(ctx0, Q,   n_enc_head_dim, W*H, B*n_enc_head);
-
-            K = ggml_view_3d   (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 1*cur->nb[3]);
-            K = ggml_reshape_4d(ctx0, K,   n_enc_head_dim, n_enc_head, W*H, B);
-            K = ggml_cont      (ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-            K = ggml_reshape_3d(ctx0, K,   n_enc_head_dim, W*H, B*n_enc_head);
-
-            V = ggml_view_3d   (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 2*cur->nb[3]);
-            V = ggml_reshape_4d(ctx0, V,   n_enc_head_dim, n_enc_head, W*H, B);
-            V = ggml_cont      (ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // transposed
-            V = ggml_reshape_3d(ctx0, V,   W*H, n_enc_head_dim, B*n_enc_head);
-
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrtf(n_enc_head_dim))
-                        );
-
-            struct ggml_tensor * rw = ggml_get_rel_pos(ctx0, layer.rel_pos_w, W, W);
-            struct ggml_tensor * rh = ggml_get_rel_pos(ctx0, layer.rel_pos_h, H, H);
-
-            struct ggml_tensor * q_r = ggml_reshape_4d(ctx0, Q, n_enc_head_dim, W, H, B*n_enc_head);
-
-            struct ggml_tensor * rel_w = ggml_cont(ctx0, ggml_permute(ctx0,
-                        ggml_mul_mat(ctx0,
-                            rw,
-                            ggml_cont(ctx0, ggml_permute(ctx0, q_r, 0, 2, 1, 3))),
-                        0, 2, 1, 3));
-            struct ggml_tensor * rel_h = ggml_mul_mat(ctx0, rh, q_r);
-
-            struct ggml_tensor * attn = ggml_add_rel_pos_inplace(ctx0, KQ_scaled, rel_w, rel_h);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, attn);
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            cur =
-                ggml_reshape_4d(ctx0,
-                        ggml_cont(ctx0,
-                            ggml_permute(ctx0,
-                                ggml_reshape_4d(ctx0, KQV, n_enc_head_dim, W*H, n_enc_head, B),
-                                0, 2, 1, 3)),
-                        n_enc_state, W, H, B);
-
-            cur = ggml_mul_mat(ctx0, layer.proj_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.proj_b);
-        }
-
-        if (hparams.is_global_attn(il) == false) {
-            // local attention layer - reverse window partition
-            cur = ggml_win_unpart(ctx0, cur, w0, h0, n_window_size);
-        }
-
-        cur = ggml_add_inplace(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_mul(ctx0, cur, layer.norm2_w);
-                cur = ggml_add_inplace(ctx0, cur, layer.norm2_b);
-            }
-
-            // fully connected
-            cur = ggml_mul_mat(ctx0, layer.mlp_lin1_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.mlp_lin1_b);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            cur = ggml_mul_mat(ctx0, layer.mlp_lin2_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.mlp_lin2_b);
-        }
-
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    cur = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 2, 0, 1, 3));
-
-    cur = ggml_conv_2d_sk_p0(ctx0, enc.neck_conv_0, cur);
-
-    cur = sam_layer_norm_2d(ctx0, cur, n_enc_out_chans, enc.neck_norm_0_w, enc.neck_norm_0_b, hparams.eps);
-
-    cur = ggml_conv_2d_s1_ph(ctx0, enc.neck_conv_1, cur);
-
-    cur = sam_layer_norm_2d(ctx0, cur, n_enc_out_chans, enc.neck_norm_1_w, enc.neck_norm_1_b, hparams.eps);
-
-    cur = ggml_cpy(ctx0, cur, state.embd_img);
-
-    ggml_build_forward_expand(gf, cur);
-    ggml_disconnect_node_from_graph(state.embd_img);
-
-    //ggml_graph_print(&gf);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-
-struct prompt_encoder_result {
-    struct ggml_tensor * embd_prompt_sparse = {};
-    struct ggml_tensor * embd_prompt_dense = {};
-};
-
-// encode a prompt
-//
-// - points
-// - boxes
-// - masks
-//
-// TODO: currently just encode a single point for simplicity
-//
-prompt_encoder_result sam_encode_prompt(
-        const sam_model     & model,
-        struct ggml_context * ctx0,
-        struct ggml_cgraph  * gf,
-                  sam_state & state,
-                        int   nx,
-                        int   ny,
-                  sam_point   point) {
-
-    const auto & hparams = model.hparams;
-    const auto & enc = model.enc_prompt;
-
-    // transform points
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276
-    {
-        const int nmax = std::max(nx, ny);
-
-        const float scale = hparams.n_img_size() / (float) nmax;
-
-        const int nx_new = int(nx*scale + 0.5f);
-        const int ny_new = int(ny*scale + 0.5f);
-
-        point.x = point.x*(float(nx_new)/nx) + 0.5f;
-        point.y = point.y*(float(ny_new)/ny) + 0.5f;
-    }
-
-    struct ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2, 2);
-
-    ggml_allocr_alloc(state.allocr, inp);
-    if (!ggml_allocr_is_measure(state.allocr)) {
-        // set the input by converting the [0, 1] coordinates to [-1, 1]
-        float * data = (float *) inp->data;
-
-        data[0] = 2.0f*(point.x / hparams.n_img_size()) - 1.0f;
-        data[1] = 2.0f*(point.y / hparams.n_img_size()) - 1.0f;
-
-        // padding
-        // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85
-        data[2] = 2.0f*(0.0f) - 1.0f;
-        data[3] = 2.0f*(0.0f) - 1.0f;
-    }
-
-    struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp);
-
-    cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI)));
-
-    // concat
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192
-    {
-        struct ggml_tensor * t_sin = ggml_map_custom1(ctx0, cur, ggml_sam_sin, GGML_N_TASKS_MAX, NULL);
-        struct ggml_tensor * t_cos = ggml_map_custom1(ctx0, cur, ggml_sam_cos, GGML_N_TASKS_MAX, NULL);
-
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, t_sin->ne[0] + t_cos->ne[0], cur->ne[1]);
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_sin, ggml_view_2d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], cur->nb[1], 0)));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_cos, ggml_view_2d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], cur->nb[1], t_sin->nb[1])));
-
-        // overwrite label == -1 with not_a_point_embed.weight
-        // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L86
-        // TODO: extend for multiple points
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, enc.not_a_pt_embd_w, ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], cur->nb[1])));
-    }
-
-    // add point_embeddings[1] to label == 1
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L90
-    struct ggml_tensor * v = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], 0);
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_add_inplace(ctx0, v, enc.pt_embd[1]), v));
-
-    struct ggml_tensor * embd_prompt_sparse = cur;
-    ggml_build_forward_expand(gf, embd_prompt_sparse);
-
-    struct ggml_tensor * embd_prompt_dense = ggml_repeat(ctx0,
-            ggml_cont(ctx0,
-                ggml_view_3d(ctx0, enc.no_mask_embd_w,
-                    1, 1, enc.no_mask_embd_w->ne[0], enc.no_mask_embd_w->nb[0], enc.no_mask_embd_w->nb[0], 0)),
-            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hparams.n_img_embd(), hparams.n_img_embd(), hparams.n_enc_out_chans));
-
-    ggml_build_forward_expand(gf, embd_prompt_dense);
-
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    prompt_encoder_result res;
-    res.embd_prompt_sparse = embd_prompt_sparse;
-    res.embd_prompt_dense  = embd_prompt_dense;
-    return res;
-}
-
-struct ggml_tensor* sam_decode_mask_transformer_attn(
-    const sam_layer_dec_transformer_attn & attn,
-                      struct ggml_tensor * queries,
-                      struct ggml_tensor * keys,
-                      struct ggml_tensor * values,
-                     struct ggml_context * ctx0,
-                         const sam_model & model) {
-    const auto & hparams = model.hparams;
-    const int n_head = hparams.n_dec_heads;
-
-    struct ggml_tensor * Qcur = {};
-    struct ggml_tensor * Kcur = {};
-    struct ggml_tensor * Vcur = {};
-
-    Qcur = ggml_mul_mat(ctx0, attn.q_w, queries);
-    Qcur = ggml_add_inplace(ctx0, Qcur, attn.q_b);
-
-    Kcur = ggml_mul_mat(ctx0, attn.k_w, keys);
-    Kcur = ggml_add_inplace(ctx0, Kcur, attn.k_b);
-
-    Vcur = ggml_mul_mat(ctx0, attn.v_w, values);
-    Vcur = ggml_add_inplace(ctx0, Vcur, attn.v_b);
-
-    struct ggml_tensor * Q = {};
-    struct ggml_tensor * K = {};
-    struct ggml_tensor * V = {};
-
-    Q = ggml_reshape_4d(ctx0, Qcur, Qcur->ne[0]/n_head, n_head, Qcur->ne[1], Qcur->ne[2]);
-    Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-
-    K = ggml_reshape_4d(ctx0, Kcur, Kcur->ne[0]/n_head, n_head, Kcur->ne[1], Kcur->ne[2]);
-    K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-
-    V = ggml_reshape_4d(ctx0, Vcur, Vcur->ne[0]/n_head, n_head, Vcur->ne[1], Vcur->ne[2]);
-    V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3));
-
-    // Q * K
-    struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-    struct ggml_tensor * KQ_scaled =
-        ggml_scale_inplace(ctx0,
-                KQ,
-                ggml_new_f32(ctx0, 1.0f/sqrt(float(Q->ne[0]))));
-
-    struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
-
-    struct ggml_tensor * KQV = ggml_mul_mat(ctx0, KQ_soft_max, ggml_cont(ctx0, ggml_transpose(ctx0, V)));
-
-    struct ggml_tensor * KQV_merged = ggml_cont(ctx0, ggml_transpose(ctx0, KQV));
-    KQV_merged = ggml_cont(ctx0, ggml_permute(ctx0, KQV_merged, 0, 2, 1, 3));
-    KQV_merged = ggml_reshape_3d(ctx0, KQV_merged, KQV_merged->ne[0]*KQV_merged->ne[1], KQV_merged->ne[2], KQV_merged->ne[3]);
-    KQV_merged = ggml_mul_mat(ctx0, attn.out_w, KQV_merged);
-    KQV_merged = ggml_add_inplace(ctx0, KQV_merged, attn.out_b);
-
-    return KQV_merged;
-}
-
-struct ggml_tensor * sam_decode_mask_mlp_relu_3(
-     struct ggml_tensor * in,
-     struct ggml_tensor * w_0,
-     struct ggml_tensor * b_0,
-     struct ggml_tensor * w_1,
-     struct ggml_tensor * b_1,
-     struct ggml_tensor * w_2,
-     struct ggml_tensor * b_2,
-    struct ggml_context * ctx0) {
-
-    struct ggml_tensor * cur = {};
-    cur = ggml_mul_mat(ctx0, w_0, in);
-    cur = ggml_add_inplace(ctx0, cur, b_0);
-
-    cur = ggml_relu_inplace(ctx0, cur);
-
-    cur = ggml_mul_mat(ctx0, w_1, cur);
-    cur = ggml_add_inplace(ctx0, cur, b_1);
-
-    cur = ggml_relu_inplace(ctx0, cur);
-
-    cur = ggml_mul_mat(ctx0, w_2, cur);
-    cur = ggml_add_inplace(ctx0, cur, b_2);
-
-    return cur;
-}
-
-bool sam_decode_mask(
-                    const sam_model & model,
-        const prompt_encoder_result & prompt,
-                 struct ggml_tensor * pe_img,
-                struct ggml_context * ctx0,
-                struct ggml_cgraph  * gf,
-                          sam_state & state) {
-
-    const auto & hparams = model.hparams;
-    const auto & dec = model.dec;
-    const int n_img_embd = hparams.n_img_embd();
-
-    struct ggml_tensor * tokens = {};
-    {
-        // Concatenate output tokens
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L120
-        const auto& sparse = prompt.embd_prompt_sparse;
-
-        tokens = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, dec.iou_token_w->ne[0], dec.iou_token_w->ne[1] + dec.mask_tokens_w->ne[1] + sparse->ne[1], sparse->ne[2]);
-
-        const size_t offsets[3] = { 0, dec.iou_token_w->ne[1]*tokens->nb[1], dec.iou_token_w->ne[1]*tokens->nb[1] + dec.mask_tokens_w->ne[1]*tokens->nb[1] };
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, dec.iou_token_w,   ggml_view_2d(ctx0, tokens, tokens->ne[0], dec.iou_token_w->ne[1],   tokens->nb[1], offsets[0])));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, dec.mask_tokens_w, ggml_view_2d(ctx0, tokens, tokens->ne[0], dec.mask_tokens_w->ne[1], tokens->nb[1], offsets[1])));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, sparse,            ggml_view_2d(ctx0, tokens, tokens->ne[0], sparse->ne[1],            tokens->nb[1], offsets[2])));
-        // TODO: Sparse prompt embeddings can have more than one point
-    }
-
-
-    struct ggml_tensor * src = {};
-    struct ggml_tensor * pos_src = {};
-    int srcNE[4] = { 0, 0, 0, 0 };
-    {
-        // Expand per-image data in the batch direction to be per-mask
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L125
-        src = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, state.embd_img->ne[0], state.embd_img->ne[1], state.embd_img->ne[2], tokens->ne[2]);
-
-        src = ggml_add(ctx0,
-            ggml_repeat(ctx0,
-                state.embd_img,
-                src),
-            prompt.embd_prompt_dense);
-
-        srcNE[0] = src->ne[0];
-        srcNE[1] = src->ne[1];
-        srcNE[2] = src->ne[2];
-        srcNE[3] = src->ne[3];
-
-        // flatten & permute
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L83
-        src = ggml_cont(ctx0, ggml_permute(ctx0,
-            ggml_view_3d(ctx0,
-                src,
-                src->ne[0]*src->ne[1],
-                src->ne[2],
-                src->ne[3],
-                src->nb[2],
-                src->nb[3],
-                0),
-            1, 0, 2, 3));
-
-        pos_src = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, pe_img->ne[0], pe_img->ne[1], pe_img->ne[2], tokens->ne[2]);
-        pos_src = ggml_repeat(ctx0,
-            pe_img,
-            pos_src);
-
-        // flatten & permute
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L83
-        pos_src = ggml_cont(ctx0, ggml_permute(ctx0,
-            ggml_view_3d(ctx0,
-                pos_src,
-                pos_src->ne[0]*pos_src->ne[1],
-                pos_src->ne[2],
-                pos_src->ne[3],
-                pos_src->nb[2],
-                pos_src->nb[3],
-                0),
-            1, 0, 2, 3));
-    }
-
-    struct ggml_tensor * queries = tokens;
-    struct ggml_tensor * keys = src;
-    {
-        // Run the transformer
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L62
-        for (int i = 0; i < int(model.dec.transformer_layers.size()); ++i) {
-            const auto& tfm_layer = model.dec.transformer_layers[i];
-
-            // Self attention block
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L154
-            const bool skip_first_layer_pe = i == 0;
-            if (skip_first_layer_pe) {
-                queries = sam_decode_mask_transformer_attn(tfm_layer.self_attn, queries, queries, queries, ctx0, model);
-            }
-            else {
-                struct ggml_tensor * q_0 = ggml_add(ctx0, queries, tokens);
-
-                struct ggml_tensor * self_attn = sam_decode_mask_transformer_attn(tfm_layer.self_attn, q_0, q_0, queries, ctx0, model);
-                queries = ggml_add(ctx0, queries, self_attn);
-            }
-
-            queries = ggml_norm(ctx0, queries, hparams.eps_decoder_transformer);
-            queries = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, queries, tfm_layer.norm1_w),
-                    tfm_layer.norm1_b);
-
-            // Cross attention block, tokens attending to image embedding
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L163
-            struct ggml_tensor * q_1 = ggml_add(ctx0, queries, tokens);
-            struct ggml_tensor * k_1 = ggml_add(ctx0, keys, pos_src);
-
-            struct ggml_tensor * cross_attn_token_to_img = sam_decode_mask_transformer_attn(tfm_layer.cross_attn_token_to_img, q_1, k_1, keys, ctx0, model);
-
-            queries = ggml_add_inplace(ctx0, queries, cross_attn_token_to_img);
-            queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer);
-            queries = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, queries, tfm_layer.norm2_w),
-                    tfm_layer.norm2_b);
-
-            // MLP block
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L170
-            struct ggml_tensor * mlp_out = ggml_mul_mat(ctx0,
-                tfm_layer.mlp_lin1_w,
-                queries);
-
-            mlp_out = ggml_add_inplace(ctx0, mlp_out, tfm_layer.mlp_lin1_b);
-
-            // RELU activation
-            mlp_out = ggml_relu_inplace(ctx0, mlp_out);
-            mlp_out = ggml_mul_mat(ctx0, tfm_layer.mlp_lin2_w, mlp_out);
-
-            mlp_out = ggml_add_inplace(ctx0, mlp_out, tfm_layer.mlp_lin2_b);
-
-            queries = ggml_add_inplace(ctx0, queries, mlp_out);
-            queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer);
-            queries = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, queries, tfm_layer.norm3_w),
-                    tfm_layer.norm3_b);
-
-            // Cross attention block, image embedding attending to tokens
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L175
-            struct ggml_tensor * q_2 = ggml_add(ctx0, queries, tokens);
-            struct ggml_tensor * k_2 = ggml_add(ctx0, keys, pos_src);
-
-            struct ggml_tensor * cross_attn_img_to_token = sam_decode_mask_transformer_attn(tfm_layer.cross_attn_img_to_token, k_2, q_2, queries, ctx0, model);
-            keys = ggml_add_inplace(ctx0, keys, cross_attn_img_to_token);
-            keys = ggml_norm_inplace(ctx0, keys, hparams.eps_decoder_transformer);
-            keys = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, keys, tfm_layer.norm4_w),
-                    tfm_layer.norm4_b);
-        }
-
-        // Apply the final attention layer from the points to the image
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L99
-        struct ggml_tensor * q = ggml_add(ctx0, queries, tokens);
-        struct ggml_tensor * k = ggml_add(ctx0, keys, pos_src);
-
-        struct ggml_tensor * final_attn_token_to_img = sam_decode_mask_transformer_attn(dec.transformer_final_attn_token_to_img, q, k, keys, ctx0, model);
-
-        queries = ggml_add_inplace(ctx0, queries, final_attn_token_to_img);
-        queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer);
-        queries = ggml_add_inplace(ctx0,
-                ggml_mul(ctx0, queries, dec.transformer_norm_final_w),
-                dec.transformer_norm_final_b);
-    }
-
-
-    struct ggml_tensor * iou_pred = ggml_view_2d(ctx0, queries, queries->ne[0], queries->ne[2], queries->nb[2], 0);
-    const int num_mask_tokens = 4; // num_multimask_outputs + 1
-    struct ggml_tensor * mask_tokens_out = ggml_view_3d(ctx0, queries, queries->ne[0], num_mask_tokens, queries->ne[2], queries->nb[1], num_mask_tokens*queries->nb[1], queries->nb[1]);
-
-    // Upscale mask embeddings and predict masks using the mask tokens
-    // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L136
-    keys = ggml_cont(ctx0, ggml_transpose(ctx0, keys));
-    keys = ggml_view_4d(ctx0, keys, srcNE[0], srcNE[1], srcNE[2], srcNE[3], srcNE[0]*keys->nb[0], keys->nb[1], keys->nb[2], 0);
-    // ggml_build_forward_expand(gf, keys);
-    struct ggml_tensor * upscaled_embedding = {};
-    {
-        // ConvTranspose2d
-        keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
-        keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0,
-                                     ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]),
-                                     keys));
-
-        keys = sam_layer_norm_2d(ctx0, keys, n_img_embd, dec.output_upscaling_1_w, dec.output_upscaling_1_b, hparams.eps);
-
-        // GELU activation
-        keys = ggml_gelu_inplace(ctx0, keys);
-
-        // ConvTranspose2d
-        keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
-        keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0,
-                                ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]),
-                                keys), keys);
-        // GELU activation
-        keys = ggml_gelu_inplace(ctx0, keys);
-        upscaled_embedding = ggml_reshape_3d(ctx0, keys, keys->ne[0]*keys->ne[1], keys->ne[2], keys->ne[3]);
-        upscaled_embedding = ggml_cont(ctx0, ggml_transpose(ctx0, upscaled_embedding)); // TODO: Shouldn't be needed
-    }
-
-    struct ggml_tensor * hyper_in = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_img_embd/2, num_mask_tokens, mask_tokens_out->ne[2]);
-
-    for (int i = 0; i < num_mask_tokens; ++i) {
-        const auto& mlp = dec.output_hypernet_mlps[i];
-        struct ggml_tensor * in = ggml_view_2d(ctx0, mask_tokens_out, mask_tokens_out->ne[0], mask_tokens_out->ne[2], mask_tokens_out->nb[1], i*mask_tokens_out->nb[1]);
-        struct ggml_tensor * out = sam_decode_mask_mlp_relu_3(in, mlp.w_0, mlp.b_0, mlp.w_1, mlp.b_1, mlp.w_2, mlp.b_2, ctx0);
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, ggml_view_2d(ctx0, hyper_in, hyper_in->ne[0], hyper_in->ne[2], hyper_in->nb[1], i*hyper_in->nb[1])));
-    }
-
-    struct ggml_tensor * masks = ggml_mul_mat(ctx0, hyper_in, upscaled_embedding);
-    masks = ggml_cont(ctx0, ggml_transpose(ctx0, masks)); // TODO: Shouldn't be needed
-    masks = ggml_reshape_4d(ctx0, masks, keys->ne[0], keys->ne[1], masks->ne[1], keys->ne[3]);
-
-    // Generate mask quality predictions
-    // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L146
-    iou_pred = sam_decode_mask_mlp_relu_3(iou_pred, dec.iou_prediction_head_0_w, dec.iou_prediction_head_0_b, dec.iou_prediction_head_1_w, dec.iou_prediction_head_1_b, dec.iou_prediction_head_2_w, dec.iou_prediction_head_2_b, ctx0);
-
-    // Select the correct mask or masks for output
-    // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L101
-    iou_pred = ggml_cpy(state.ctx, ggml_view_1d(ctx0, iou_pred, iou_pred->ne[0] - 1, iou_pred->nb[0]), state.iou_predictions);
-    masks = ggml_view_4d(ctx0, masks, masks->ne[0], masks->ne[1], masks->ne[2] - 1, masks->ne[3],
-                                      masks->nb[1], masks->nb[2], masks->nb[3], masks->nb[2] /* offset*/);
-    masks = ggml_cpy(state.ctx, masks, state.low_res_masks);
-
-    ggml_build_forward_expand(gf, masks);
-    ggml_build_forward_expand(gf, iou_pred);
-
-    ggml_disconnect_node_from_graph(state.low_res_masks);
-    ggml_disconnect_node_from_graph(state.iou_predictions);
-
-    return true;
-}
-
-bool sam_write_masks(const sam_hparams& hparams, int nx, int ny, const sam_state & state) {
-    if (state.low_res_masks->ne[2] == 0) return true;
-    if (state.low_res_masks->ne[2] != state.iou_predictions->ne[0]) {
-        printf("Error: number of masks (%d) does not match number of iou predictions (%d)\n", (int)state.low_res_masks->ne[2], (int)state.iou_predictions->ne[0]);
-        return false;
-    }
-
-    const int n_img_size = hparams.n_img_size();
-    const float mask_threshold = hparams.mask_threshold;
-    const float iou_threshold = hparams.iou_threshold;
-    const float stability_score_threshold = hparams.stability_score_threshold;
-    const float intersection_threshold = mask_threshold + hparams.stability_score_offset;
-    const float union_threshold = mask_threshold - hparams.stability_score_offset;
-
-    const int ne0 = state.low_res_masks->ne[0];
-    const int ne1 = state.low_res_masks->ne[1];
-    const int ne2 = state.low_res_masks->ne[2];
-
-    // Remove padding and upscale masks to the original image size.
-    // ref: https://github.com/facebookresearch/segment-anything/blob/efeab7296ab579d4a261e554eca80faf6b33924a/segment_anything/modeling/sam.py#L140
-
-    const float preprocess_scale = std::max(nx, ny) / float(n_img_size);
-    const int cropped_nx = int(nx / preprocess_scale + 0.5f);
-    const int cropped_ny = int(ny / preprocess_scale + 0.5f);
-
-    const float scale_x_1 = (float)ne0 / (float)n_img_size;
-    const float scale_y_1 = (float)ne1 / (float)n_img_size;
-
-    const float scale_x_2 = float(cropped_nx) / float(nx);
-    const float scale_y_2 = float(cropped_ny) / float(ny);
-
-    const auto iou_data = (float*)state.iou_predictions->data;
-
-    for (int i = 0; i < ne2; ++i) {
-        if (iou_threshold > 0.f && iou_data[i] < iou_threshold) {
-            printf("Skipping mask %d with iou %f below threshold %f\n", i, iou_data[i], iou_threshold);
-            continue; // Filtering masks with iou below the threshold
-        }
-
-        std::vector<float> mask_data(n_img_size*n_img_size);
-        {
-            const float* data = (float *) state.low_res_masks->data + i*ne0*ne1;
-
-            for (int iy = 0; iy < n_img_size; ++iy) {
-                for (int ix = 0; ix < n_img_size; ++ix) {
-                    const float sx = std::max(scale_x_1*(ix + 0.5f) - 0.5f, 0.0f);
-                    const float sy = std::max(scale_y_1*(iy + 0.5f) - 0.5f, 0.0f);
-
-                    const int x0 = std::max(0, (int)sx);
-                    const int y0 = std::max(0, (int)sy);
-
-                    const int x1 = std::min(x0 + 1, ne0 - 1);
-                    const int y1 = std::min(y0 + 1, ne1 - 1);
-
-                    const float dx = sx - x0;
-                    const float dy = sy - y0;
-
-                    const int j00 = y0*ne0 + x0;
-                    const int j01 = y0*ne0 + x1;
-                    const int j10 = y1*ne0 + x0;
-                    const int j11 = y1*ne0 + x1;
-
-                    const float v00 = data[j00];
-                    const float v01 = data[j01];
-                    const float v10 = data[j10];
-                    const float v11 = data[j11];
-
-                    const float v0 = (1-dx)*v00 + dx*v01;
-                    const float v1 = (1-dx)*v10 + dx*v11;
-
-                    const float v = (1-dy)*v0 + dy*v1;
-
-                    mask_data[iy*n_img_size + ix] = v;
-                }
-            }
-        }
-
-        int intersections = 0;
-        int unions = 0;
-        sam_image_u8 res;
-        int min_iy = ny;
-        int max_iy = 0;
-        int min_ix = nx;
-        int max_ix = 0;
-        {
-            const float* data = mask_data.data();
-
-            res.nx = nx;
-            res.ny = ny;
-            res.data.resize(nx*ny);
-
-            for (int iy = 0; iy < ny; ++iy) {
-                for (int ix = 0; ix < nx; ++ix) {
-                    const float sx = std::max(scale_x_2*(ix + 0.5f) - 0.5f, 0.0f);
-                    const float sy = std::max(scale_y_2*(iy + 0.5f) - 0.5f, 0.0f);
-
-                    const int x0 = std::max(0, (int)sx);
-                    const int y0 = std::max(0, (int)sy);
-
-                    const int x1 = std::min(x0 + 1, cropped_nx - 1);
-                    const int y1 = std::min(y0 + 1, cropped_ny - 1);
-
-                    const float dx = sx - x0;
-                    const float dy = sy - y0;
-
-                    const int j00 = y0*n_img_size + x0;
-                    const int j01 = y0*n_img_size + x1;
-                    const int j10 = y1*n_img_size + x0;
-                    const int j11 = y1*n_img_size + x1;
-
-                    const float v00 = data[j00];
-                    const float v01 = data[j01];
-                    const float v10 = data[j10];
-                    const float v11 = data[j11];
-
-                    const float v0 = (1-dx)*v00 + dx*v01;
-                    const float v1 = (1-dx)*v10 + dx*v11;
-
-                    const float v = (1-dy)*v0 + dy*v1;
-
-                    if (v > intersection_threshold) {
-                        intersections++;
-                    }
-                    if (v > union_threshold) {
-                        unions++;
-                    }
-                    if (v > mask_threshold) {
-                        min_iy = std::min(min_iy, iy);
-                        max_iy = std::max(max_iy, iy);
-                        min_ix = std::min(min_ix, ix);
-                        max_ix = std::max(max_ix, ix);
-
-                        res.data[iy*nx + ix] = 255;
-                    }
-                }
-            }
-        }
-
-        const float stability_score = float(intersections) / float(unions);
-        if (stability_score_threshold > 0.f && stability_score < stability_score_threshold) {
-            printf("Skipping mask %d with stability score %f below threshold %f\n", i, stability_score, stability_score_threshold);
-            continue; // Filtering masks with stability score below the threshold
-        }
-
-        printf("Mask %d: iou = %f, stability_score = %f, bbox (%d, %d), (%d, %d)\n",
-                i, iou_data[i], stability_score, min_ix, max_ix, min_iy, max_iy);
-
-        std::string filename = "mask_out_" + std::to_string(i) + ".png";
-        if (!stbi_write_png(filename.c_str(), res.nx, res.ny, 1, res.data.data(), res.nx)) {
-            printf("%s: failed to write mask %s\n", __func__, filename.c_str());
-            return false;
-        }
-    }
-
-
-    return true;
-}
-
-struct ggml_cgraph  * sam_build_fast_graph(
-        const sam_model     & model,
-                  sam_state & state,
-                        int   nx,
-                        int   ny,
-                  sam_point   point) {
-
-    struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_fast.size(),
-        /*.mem_buffer =*/ state.buf_compute_fast.data(),
-        /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
-    };
-
-    struct ggml_context * ctx0   = ggml_init(ggml_params);
-    struct ggml_cgraph  * gf     = ggml_new_graph(ctx0);
-
-    prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state, nx, ny, point);
-    if (!enc_res.embd_prompt_sparse || !enc_res.embd_prompt_dense) {
-        fprintf(stderr, "%s: failed to encode prompt\n", __func__);
-        return {};
-    }
-
-    struct ggml_tensor * pe_img_dense = sam_fill_dense_pe(model, ctx0, gf, state);
-    if (!pe_img_dense) {
-        fprintf(stderr, "%s: failed to get dense positional encoding\n", __func__);
-        return {};
-    }
-
-    if (!sam_decode_mask(model, enc_res, pe_img_dense, ctx0, gf, state)) {
-         fprintf(stderr, "%s: failed to decode mask\n", __func__);
-         return {};
-    }
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-struct sam_params {
-    int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-
-    std::string model     = "models/sam-vit-b/ggml-model-f16.bin"; // model path
-    std::string fname_inp = "img.jpg";
-    std::string fname_out = "img.out";
-};
-
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -i FNAME, --inp FNAME\n");
-    fprintf(stderr, "                        input file (default: %s)\n", params.fname_inp.c_str());
-    fprintf(stderr, "  -o FNAME, --out FNAME\n");
-    fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-bool sam_params_parse(int argc, char ** argv, sam_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-i" || arg == "--inp") {
-            params.fname_inp = argv[++i];
-        } else if (arg == "-o" || arg == "--out") {
-            params.fname_out = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
-            sam_print_usage(argc, argv, params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            sam_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    const int64_t t_main_start_us = ggml_time_us();
-
-    sam_params params;
-    params.model = "models/sam-vit-b/ggml-model-f16.bin";
-
-    sam_model model;
-    sam_state state;
-    int64_t t_load_us = 0;
-
-    if (sam_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-
-    // load the image
-    sam_image_u8 img0;
-    if (!sam_image_load_from_file(params.fname_inp, img0)) {
-        fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, params.fname_inp.c_str());
-        return 1;
-    }
-    fprintf(stderr, "%s: loaded image '%s' (%d x %d)\n", __func__, params.fname_inp.c_str(), img0.nx, img0.ny);
-
-    // preprocess to f32
-    sam_image_f32 img1;
-    if (!sam_image_preprocess(img0, img1)) {
-        fprintf(stderr, "%s: failed to preprocess image\n", __func__);
-        return 1;
-    }
-    fprintf(stderr, "%s: preprocessed image (%d x %d)\n", __func__, img1.nx, img1.ny);
-
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!sam_model_load(params.model, model)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    {
-        static size_t buf_size = 256u*1024*1024;
-
-        struct ggml_init_params ggml_params = {
-            /*.mem_size   =*/ buf_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        state.ctx = ggml_init(ggml_params);
-
-        state.embd_img = ggml_new_tensor_3d(state.ctx, GGML_TYPE_F32,
-                model.hparams.n_img_embd(), model.hparams.n_img_embd(), model.hparams.n_enc_out_chans);
-
-        state.low_res_masks = ggml_new_tensor_3d(state.ctx, GGML_TYPE_F32,
-                model.hparams.n_enc_out_chans, model.hparams.n_enc_out_chans, 3);
-
-        state.iou_predictions = ggml_new_tensor_1d(state.ctx, GGML_TYPE_F32, 3);
-    }
-
-
-    static const size_t tensor_alignment = 32;
-    {
-        state.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-        state.allocr = ggml_allocr_new_measure(tensor_alignment);
-        struct ggml_cgraph * gf_measure = sam_encode_image(model, state, img1);
-        if (!gf_measure) {
-            fprintf(stderr, "%s: failed to encode image\n", __func__);
-            return 1;
-        }
-
-        size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment;
-        ggml_allocr_free(state.allocr);
-
-        // recreate allocator with exact memory requirements
-        state.buf_alloc_img_enc.resize(alloc_size);
-        state.allocr = ggml_allocr_new(state.buf_alloc_img_enc.data(), state.buf_alloc_img_enc.size(), tensor_alignment);
-
-        // compute the graph with the measured exact memory requirements from above
-        ggml_allocr_reset(state.allocr);
-
-        struct ggml_cgraph  * gf = sam_encode_image(model, state, img1);
-        if (!gf) {
-            fprintf(stderr, "%s: failed to encode image\n", __func__);
-            return 1;
-        }
-
-        ggml_allocr_alloc_graph(state.allocr, gf);
-
-        ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads);
-
-        print_t_f32("embd_img", state.embd_img);
-
-        ggml_allocr_free(state.allocr);
-        state.allocr = NULL;
-        state.work_buffer.clear();
-    }
-    {
-        state.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
-        state.allocr = ggml_allocr_new_measure(tensor_alignment);
-
-        // TODO: user input
-        const sam_point pt = { 414.375f, 162.796875f, };
-        // measure memory requirements for the graph
-        struct ggml_cgraph  * gf_measure = sam_build_fast_graph(model, state, img0.nx, img0.ny, pt);
-        if (!gf_measure) {
-            fprintf(stderr, "%s: failed to build fast graph to measure\n", __func__);
-            return 1;
-        }
-
-        size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment;
-        ggml_allocr_free(state.allocr);
-
-        // recreate allocator with exact memory requirements
-        state.buf_alloc_fast.resize(alloc_size);
-        state.allocr = ggml_allocr_new(state.buf_alloc_fast.data(), state.buf_alloc_fast.size(), tensor_alignment);
-
-        // compute the graph with the measured exact memory requirements from above
-        ggml_allocr_reset(state.allocr);
-
-        struct ggml_cgraph  * gf = sam_build_fast_graph(model, state, img0.nx, img0.ny, pt);
-        if (!gf) {
-            fprintf(stderr, "%s: failed to build fast graph\n", __func__);
-            return 1;
-        }
-
-        ggml_allocr_alloc_graph(state.allocr, gf);
-
-        ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads);
-
-        //print_t_f32("iou_predictions", state.iou_predictions);
-        //print_t_f32("low_res_masks", state.low_res_masks);
-        ggml_allocr_free(state.allocr);
-        state.allocr = NULL;
-    }
-
-    if (!sam_write_masks(model.hparams, img0.nx, img0.ny, state)) {
-        fprintf(stderr, "%s: failed to write masks\n", __func__);
-        return 1;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 31
ggml/examples/starcoder/CMakeLists.txt

@@ -1,31 +0,0 @@
-#
-# starcoder
-
-set(TEST_TARGET starcoder)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# starcoder-mmap
-
-set(TEST_TARGET starcoder-mmap)
-add_executable(${TEST_TARGET} starcoder-mmap.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# starcoder-quantize
-
-set(TEST_TARGET starcoder-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# For GPU offloading
-
-if (GGML_CUBLAS)
-    add_compile_definitions(GGML_USE_CUBLAS)
-endif()
-if (GGML_CLBLAST)
-    add_compile_definitions(GGML_USE_CLBLAST)
-endif()
-

+ 0 - 115
ggml/examples/starcoder/README.md

@@ -1,115 +0,0 @@
-# 💫 StarCoder
-
-This is a C++ example running 💫 StarCoder inference using the [ggml](https://github.com/ggerganov/ggml) library.
-
-The program runs on the CPU - no video card is required.
-
-The example supports the following 💫 StarCoder models:
-
-- `bigcode/starcoder`
-- `bigcode/gpt_bigcode-santacoder` aka the smol StarCoder
-
-Sample performance on MacBook M1 Pro:
-
-TODO
-
-
-Sample output:
-
-```
-$ ./bin/starcoder -h
-usage: ./bin/starcoder [options]
-
-options:
-  -h, --help            show this help message and exit
-  -s SEED, --seed SEED  RNG seed (default: -1)
-  -t N, --threads N     number of threads to use during computation (default: 8)
-  -p PROMPT, --prompt PROMPT
-                        prompt to start generation with (default: random)
-  -n N, --n_predict N   number of tokens to predict (default: 200)
-  --top_k N             top-k sampling (default: 40)
-  --top_p N             top-p sampling (default: 0.9)
-  --temp N              temperature (default: 1.0)
-  -b N, --batch_size N  batch size for prompt processing (default: 8)
-  -m FNAME, --model FNAME
-                        model path (default: models/starcoder-117M/ggml-model.bin)
-
-$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2      
-main: seed = 1683881276
-starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin'
-starcoder_model_load: n_vocab = 49280
-starcoder_model_load: n_ctx   = 2048
-starcoder_model_load: n_embd  = 2048
-starcoder_model_load: n_head  = 16
-starcoder_model_load: n_layer = 24
-starcoder_model_load: ftype   = 3
-starcoder_model_load: ggml ctx size = 1794.90 MB
-starcoder_model_load: memory size =   768.00 MB, n_mem = 49152
-starcoder_model_load: model size  =  1026.83 MB
-main: prompt: 'def fibonnaci('
-main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7 
-
-def fibonnaci(n):
-    if n == 0:
-        return 0
-    elif n == 1:
-        return 1
-    else:
-        return fibonacci(n-1) + fibonacci(n-2)
-
-print(fibo(10))
-
-main: mem per token =  9597928 bytes
-main:     load time =   480.43 ms
-main:   sample time =    26.21 ms
-main:  predict time =  3987.95 ms / 19.36 ms per token
-main:    total time =  4580.56 ms
-```
-
-## Quick start
-```bash
-git clone https://github.com/ggerganov/ggml
-cd ggml
-
-# Install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# Convert HF model to ggml
-python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder
-
-# Build ggml + examples
-mkdir build && cd build
-cmake .. && make -j4 starcoder starcoder-quantize
-
-# quantize the model
-./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3
-
-# run inference
-./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" --top_k 0 --top_p 0.95 --temp 0.2
-```
-
-
-## Downloading and converting the original models (💫 StarCoder)
-
-You can download the original model and convert it to `ggml` format using the script `convert-hf-to-ggml.py`:
-
-```
-# Convert HF model to ggml
-python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder
-```
-
-This conversion requires that you have python and Transformers installed on your computer.
-
-## Quantizing the models
-
-You can also try to quantize the `ggml` models via 4-bit integer quantization.
-
-```
-# quantize the model
-./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3
-```
-
-| Model | Original size | Quantized size | Quantization type |
-| --- | --- | --- | --- |
-| `bigcode/gpt_bigcode-santacoder` | 5396.45 MB | 1026.83 MB | 4-bit integer (q4_1) |
-| `bigcode/starcoder` | 71628.23 MB | 13596.23 MB | 4-bit integer (q4_1) |

+ 0 - 208
ggml/examples/starcoder/convert-hf-to-ggml.py

@@ -1,208 +0,0 @@
-# Convert HF models to ggml format
-#
-
-import sys
-import struct
-import json
-import torch
-import numpy as np
-import re
-import os
-import argparse
-
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-parser = argparse.ArgumentParser(description='Convert starcoder HF model to GGML')
-parser.add_argument('model_name_or_path', type=str, help='Name of model on HF hub, or local model folder')
-parser.add_argument('--outfile', type=str, default='ggml-model.bin', help='Path of GGML file to write.')
-parser.add_argument('--use_f32', action="store_true", help='Save GGML file in fp32')
-
-args = parser.parse_args()
-
-# use 16-bit or 32-bit floats
-use_f16 = not args.use_f32
-
-fname_out = args.outfile
-fname_dir = os.path.dirname(fname_out)
-if fname_dir:
-    os.makedirs(fname_dir, exist_ok=True)
-
-print("Loading model: ", args.model_name_or_path)
-tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
-hparams = config.to_dict()
-model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config, torch_dtype=torch.float16 if use_f16 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True, offload_state_dict=True)
-print("Model loaded: ", args.model_name_or_path)
-
-list_vars = model.state_dict()
-
-encoder = tokenizer.vocab
-# Add added_tokens (special tokens) to the encoder
-encoder.update(tokenizer.get_added_vocab())
-print(hparams)
-
-print("Saving ggml model to: ", fname_out)
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-vocab_size = hparams["vocab_size"]
-fout.write(struct.pack("i", vocab_size))
-# fout.write(struct.pack("i", len(encoder)))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", use_f16))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", vocab_size))
-
-counter = 0
-# sort by value
-for key in sorted(encoder, key=encoder.get):
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-
-# TODO: Repeat last token until vocab_size
-while counter < vocab_size:
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-# assert counter == config.vocab_size
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # rename headers to keep compatibility
-    if name == "transformer.ln_f.weight":
-        name = "model/ln_f/g"
-    elif name == "transformer.ln_f.bias":
-        name = "model/ln_f/b"
-    elif name == "transformer.wte.weight":
-        name = "model/wte"
-    elif name == "transformer.wpe.weight":
-        name = "model/wpe"
-    elif name == "lm_head.weight":
-        name = "model/lm_head"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/g"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/w"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/w"
-    elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/b"
-    elif re.match(r"transformer.h.\d+.ln_2.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/g"
-    elif re.match(r"transformer.h.\d+.ln_2.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/b"
-    else:
-        print("Unrecognized variable name. %s", name)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
-        if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-
-    "model/h.*/attn/c_attn/w"
-    "model/h.*/attn/c_proj/w"
-    "model/h.*/mlp/c_fc/w"
-    "model/h.*/mlp/c_proj/w"
-    if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b":
-        print("  Duplicate K,V heads to use MHA instead of MQA")
-
-        embed_dim = hparams["n_embd"]
-        head_dim = embed_dim // hparams["n_head"]
-
-        # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
-        # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
-        if len(k.shape) == 2:
-            k = np.tile(k, (hparams["n_head"], 1))
-            v = np.tile(v, (hparams["n_head"], 1))
-        elif len(k.shape) == 1:
-            k = np.tile(k, (hparams["n_head"]))
-            v = np.tile(v, (hparams["n_head"]))
-        # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        data = np.concatenate((q, k, v), axis=0)
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")

+ 0 - 927
ggml/examples/starcoder/main.cpp

@@ -1,927 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (GPT-2 117M)
-// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json
-struct starcoder_hparams {
-    int32_t n_vocab = 49280;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 2048;
-    int32_t n_head  = 16;
-    int32_t n_layer = 24;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct starcoder_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct starcoder_model {
-    starcoder_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<starcoder_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-
-            // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-        }
-
-        // Add StarChat special tokens.
-        for (std::string token : {
-                "<|system|>",
-                "<|user|>",
-                "<|assistant|>",
-                "<|end|>",
-                "<fim-prefix>",
-                "<fim-middle>",
-                "<fim-suffix>",
-                "<fim-pad>",
-                "<|end_of_turn|>"
-            }) {
-            if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) {
-                vocab.add_special_token(token);
-            }
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        const int head_dim = n_embd / hparams.n_head;
-        const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
-        const int kv_dim   = kv_heads * head_dim;
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
-
-        ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w // TODO:
-        ctx_size += n_layer*(       (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        const int head_dim = n_embd / hparams.n_head;
-        const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
-        const int kv_dim   = kv_heads * head_dim;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd + 2*kv_dim);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
-                        __func__, name.c_str(), (int) ggml_nelements(tensor), nelements);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool starcoder_eval(
-        const starcoder_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
-    static void * scr0 = malloc(scr0_size);
-
-    static size_t scr1_size = 256u*1024*1024;
-    static void * scr1 = malloc(scr1_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3); //TODO: need to be tiled
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    starcoder_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!starcoder_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    if (params.repeat_last_n == -1) {
-        params.repeat_last_n = model.hparams.n_ctx;
-    }
-    printf("\n");
-    printf("%s: temp           = %.3f\n", __func__, params.temp);
-    printf("%s: top_k          = %d\n",   __func__, params.top_k);
-    printf("%s: top_p          = %.3f\n", __func__, params.top_p);
-    printf("%s: repeat_last_n  = %d\n",   __func__, params.repeat_last_n);
-    printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty);
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    std::vector<int32_t> last_n_tokens(model.hparams.n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n\n");
-
-    // Handle StarChat "<|end|>" and OpenCoder "<|end_of_turn>" tokens.
-    gpt_vocab::id starchat_end_token = -1;
-    {
-        const auto it = vocab.token_to_id.find("<|end|>");
-        if (it != vocab.token_to_id.end()) {
-            starchat_end_token = it->second;
-        } else {
-            const auto eot_token_id = vocab.token_to_id.find("<|end_of_turn|>");
-            if (eot_token_id != vocab.token_to_id.end()) {
-              starchat_end_token = eot_token_id->second;
-            }
-        }
-    }
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, params.repeat_last_n, params.repeat_penalty, rng);
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-
-            last_n_tokens.erase(last_n_tokens.begin());
-            last_n_tokens.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[k]);
-
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // check if model is santacoder
-        if (model.hparams.n_layer <= 30 && embd.back() == 49152) {
-            break;
-        }
-        // check if model is starcoder
-        else if (embd.back() == 0) { //TODO: this is only for starcoder
-            break;
-        }
-        // Handle StarChat "<|end|>" token.
-        else if (embd.back() == starchat_end_token && i >= embd_inp.size()) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}

+ 0 - 184
ggml/examples/starcoder/quantize.cpp

@@ -1,184 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (GPT-2 117M)
-struct starcoder_hparams {
-    int32_t n_vocab = 49280;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 2048;
-    int32_t n_head  = 16;
-    int32_t n_layer = 24;
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool starcoder_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    starcoder_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        "model/wte",
-        "model/lm_head",
-        "model/h.*/attn/c_attn/w",
-        "model/h.*/attn/c_proj/w",
-        "model/h.*/mlp/c_fc/w",
-        "model/h.*/mlp/c_proj/w",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!starcoder_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 1127
ggml/examples/starcoder/starcoder-mmap.cpp

@@ -1,1127 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if !defined(_WIN32)
-// mmap
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <fcntl.h>
-#else
-#define NOMINMAX
-#include <Windows.h>
-#endif
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_CLBLAST
-#include "ggml-opencl.h"
-#endif
-
-// default hparams (GPT-2 117M)
-// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json
-struct starcoder_hparams {
-    int32_t n_vocab = 49280;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 2048;
-    int32_t n_head  = 16;
-    int32_t n_layer = 24;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct starcoder_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct llama_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    llama_buffer() = default;
-
-    void resize(size_t len) {
-#ifdef GGML_USE_METAL
-        free(addr);
-        int result = posix_memalign((void **) &addr, getpagesize(), len);
-        if (result == 0) {
-            memset(addr, 0, len);
-        }
-        else {
-            addr = NULL;
-        }
-#else
-        delete[] addr;
-        addr = new uint8_t[len];
-#endif
-        size = len;
-    }
-
-    ~llama_buffer() {
-#ifdef GGML_USE_METAL
-        free(addr);
-#else
-        delete[] addr;
-#endif
-        addr = NULL;
-    }
-
-    // disable copy and move
-    llama_buffer(const llama_buffer&) = delete;
-    llama_buffer(llama_buffer&&) = delete;
-    llama_buffer& operator=(const llama_buffer&) = delete;
-    llama_buffer& operator=(llama_buffer&&) = delete;
-};
-
-
-struct kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    //std::vector<uint8_t> buf;
-    llama_buffer buf;
-
-    int n;
-};
-
-struct starcoder_model {
-    starcoder_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<starcoder_layer> layers;
-
-    // key + value memory
-    //struct ggml_tensor * memory_k;
-    //struct ggml_tensor * memory_v;
-    struct kv_cache cache;
-
-    // model memory mapped file
-    void * mm_addr = NULL;
-    uint64_t mm_length = 0;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// From PR #613 (https://github.com/ggerganov/llama.cpp/pull/613)
-static void *mmap_file(const char *fname, uint64_t *mm_length) {
-#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
-    HANDLE hFile = CreateFileA(fname,
-                               GENERIC_READ,
-                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
-                               NULL,
-                               OPEN_EXISTING,
-                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
-                               NULL);
-    if (hFile == INVALID_HANDLE_VALUE) return 0;
-    LARGE_INTEGER fileSize;
-    fileSize.QuadPart = -1;
-    GetFileSizeEx(hFile, &fileSize);
-    int64_t length = fileSize.QuadPart;
-    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-    CloseHandle(hFile);
-    if (!hMapping) return 0;
-    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-    CloseHandle(hMapping);
-    if (!addr) return 0;
-#else
-    int fd = open(fname, O_RDONLY);
-    if (fd == -1) return 0;
-    int64_t length = lseek(fd, 0, SEEK_END);
-    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
-    close(fd);
-    if (addr == MAP_FAILED) return 0;
-#endif
-    *mm_length = length;
-    return addr;
-}
-
-static void munmap_file(void * addr, size_t length) {
-#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
-    UnmapViewOfFile(addr);
-#else
-    munmap(addr, length);
-#endif
-}
-
-// load the model's weights from a file
-bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab, int32_t n_gpu_layers) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    std::vector<char> f_buf(1024*1024);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
-
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        //if (magic != 0x67676a74) {
-        if (magic != 0x67676d6c) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-
-            // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-        }
-
-        // Add StarChat special tokens.
-        for (std::string token : {
-                "<|system|>",
-                "<|user|>",
-                "<|assistant|>",
-                "<|end|>",
-                }) {
-            if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) {
-                vocab.add_special_token(token);
-            }
-        }
-    }
-
-    char *mm_addr = NULL;
-    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
-    if (model.mm_addr == NULL) {
-        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-    mm_addr = (char *)model.mm_addr;
-    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_layer = hparams.n_layer;
-
-
-        /*
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        const int head_dim = n_embd / hparams.n_head;
-        const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
-        const int kv_dim   = kv_heads * head_dim;
-
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
-
-        ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w // TODO:
-        ctx_size += n_layer*(       (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
-        */
-
-        ctx_size += (6 + 12*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        const int head_dim = n_embd / hparams.n_head;
-        const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
-        const int kv_dim   = kv_heads * head_dim;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd + 2*kv_dim);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.cache.buf.resize(2u*n_elements*ggml_type_size(GGML_TYPE_F16) + 2u*1024*1024);
-
-        struct ggml_init_params c_params;
-        c_params.mem_size   = model.cache.buf.size;
-        c_params.mem_buffer = model.cache.buf.addr;
-        c_params.no_alloc   = false;
-
-        model.cache.ctx = ggml_init(c_params);
-
-        if (!model.cache.ctx) {
-            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            return false;
-        }
-
-        model.cache.k = ggml_new_tensor_1d(model.cache.ctx, GGML_TYPE_F16, n_elements);
-        model.cache.v = ggml_new_tensor_1d(model.cache.ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.cache.k) + ggml_nbytes(model.cache.v);
-
-        printf("%s: kv_cache memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
-                        __func__, name.data(), (int) ggml_nelements(tensor), nelements);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            // mmap
-            size_t offset = fin.tellg();
-            size_t tensor_data_size = ggml_nbytes(tensor);
-            //offset = (offset + 31) & -32;
-            tensor->data = mm_addr + offset;
-            fin.seekg(offset + tensor_data_size);
-            total_size += tensor_data_size;
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                // Dont know if this is required, test models have an lm_head
-                model.lm_head->data = tensor->data;
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-#ifdef GGML_USE_CUBLAS
-    {
-        const auto & hparams = model.hparams;
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
-            ggml_cuda_transform_tensor((uint8_t *)layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
-
-            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
-            ggml_cuda_transform_tensor((uint8_t *)layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-
-            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
-            ggml_cuda_transform_tensor((uint8_t *)layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-
-            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
-            ggml_cuda_transform_tensor((uint8_t *)layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-        }
-
-        ggml_cuda_set_scratch_size(0); // disable scratch
-
-        //if (n_gpu_layers > (int) hparams.n_layer) {
-        //    fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-        //    ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
-        //}
-
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
-#elif defined(GGML_USE_CLBLAST)
-    //From koboldcpp
-    {
-        const auto & hparams = model.hparams;
-        size_t vram_total = 0;
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
-            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
-            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
-            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
-            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
-            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-        }
-        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
-    #endif
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool starcoder_eval(
-        const starcoder_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-
-    const int N = int(embd_inp.size());
-
-    const auto & hparams = model.hparams;
-
-    auto & cache = model.cache;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    // Scratch is too small for large n_batch (256)
-    //static size_t buf_size = 256u*1024*1024;
-    static size_t buf_size = 256u*1024*1024*2;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scratch0_size = 256u*1024*1024*2;
-    static void * scratch0 = malloc(scratch0_size);
-
-    static size_t scratch1_size = 256u*1024*1024*2;
-    static void * scratch1 = malloc(scratch1_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = size_t(1.1*(mem_per_token*N)); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-
-
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scratch0_size, scratch0, });
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, cache.k, N*n_embd, (ggml_element_size(cache.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, cache.v, N*n_embd, (ggml_element_size(cache.v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, cache.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(cache.k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3); //TODO: need to be tiled
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, cache.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(cache.v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, cache.v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        ggml_set_scratch(ctx0, { 0, scratch1_size, scratch1, });
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    ggml_set_scratch(ctx0, { 0, scratch0_size, scratch0, });
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = int(time(NULL));
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    starcoder_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!starcoder_model_load(params.model, model, vocab, params.n_gpu_layers)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n\n");
-
-    // Handle StarChat "<|end|>" token.
-    gpt_vocab::id starchat_end_token = -1;
-    {
-        const auto it = vocab.token_to_id.find("<|end|>");
-        if (it != vocab.token_to_id.end()) {
-            starchat_end_token = it->second;
-        }
-    }
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    printf("Calling starcoder_eval\n");
-    starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            // Should input processing count towards t_predict?
-            if (i > embd_inp.size()) {
-                t_predict_us += ggml_time_us() - t_start_us;
-            }
-        }
-
-        n_past += int(embd.size());
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += int(embd.size()) - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // check if model is santacoder
-        if (model.hparams.n_layer <= 30 && embd.back() == 49152) {
-            break;
-        }
-        // check if model is starcoder
-        else if (embd.back() == 0) { //TODO: this is only for starcoder
-            break;
-        }
-        // Handle StarChat "<|end|>" token.
-        else if (embd.back() == starchat_end_token) {
-            //break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        //Shouldnt the input prompt be subracted?
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(n_past - embd_inp.size()));
-        //printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    if (model.mm_addr) {
-	    munmap_file(model.mm_addr, model.mm_length);
-    }
-
-    return 0;
-}

+ 0 - 7987
ggml/examples/stb_image.h

@@ -1,7987 +0,0 @@
-/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
-                                  no warranty implied; use at your own risk
-
-   Do this:
-      #define STB_IMAGE_IMPLEMENTATION
-   before you include this file in *one* C or C++ file to create the implementation.
-
-   // i.e. it should look like this:
-   #include ...
-   #include ...
-   #include ...
-   #define STB_IMAGE_IMPLEMENTATION
-   #include "stb_image.h"
-
-   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
-   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
-
-
-   QUICK NOTES:
-      Primarily of interest to game developers and other people who can
-          avoid problematic images and only need the trivial interface
-
-      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8/16-bit-per-channel
-
-      TGA (not sure what subset, if a subset)
-      BMP non-1bpp, non-RLE
-      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
-
-      GIF (*comp always reports as 4-channel)
-      HDR (radiance rgbE format)
-      PIC (Softimage PIC)
-      PNM (PPM and PGM binary only)
-
-      Animated GIF still needs a proper API, but here's one way to do it:
-          http://gist.github.com/urraka/685d9a6340b26b830d49
-
-      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
-      - decode from arbitrary I/O callbacks
-      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
-
-   Full documentation under "DOCUMENTATION" below.
-
-
-LICENSE
-
-  See end of file for license information.
-
-RECENT REVISION HISTORY:
-
-      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
-      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
-      2.26  (2020-07-13) many minor fixes
-      2.25  (2020-02-02) fix warnings
-      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
-      2.23  (2019-08-11) fix clang static analysis warning
-      2.22  (2019-03-04) gif fixes, fix warnings
-      2.21  (2019-02-25) fix typo in comment
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
-      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
-      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
-                         RGB-format JPEG; remove white matting in PSD;
-                         allocate large structures on the stack;
-                         correct channel count for PNG & BMP
-      2.10  (2016-01-22) avoid warning introduced in 2.09
-      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
-
-   See end of file for full revision history.
-
-
- ============================    Contributors    =========================
-
- Image formats                          Extensions, features
-    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
-    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
-    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
-    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
-    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
-    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
-    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
-    github:urraka (animated gif)           Junggon Kim (PNM comments)
-    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
-                                           socks-the-fox (16-bit PNG)
-                                           Jeremy Sawicki (handle all ImageNet JPGs)
- Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
-    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
-    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
-    John-Mark Allen
-    Carmelo J Fdez-Aguera
-
- Bug & warning fixes
-    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
-    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
-    Phil Jordan                                Dave Moore           Roy Eltham
-    Hayaki Saito            Nathan Reed        Won Chun
-    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
-    Thomas Ruf              Ronny Chevalier                         github:rlyeh
-    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
-    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
-    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
-    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
-    Cass Everitt            Ryamond Barbiero                        github:grim210
-    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
-    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
-    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
-    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
-    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
-                            Brad Weinberger    Matvey Cherevko      github:mosra
-    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
-    Ryan C. Gordon          [reserved]                              [reserved]
-                     DO NOT ADD YOUR NAME HERE
-
-                     Jacko Dirks
-
-  To add your name to the credits, pick a random blank space in the middle and fill it.
-  80% of merge conflicts on stb PRs are due to people adding their name at the end
-  of the credits.
-*/
-
-#ifndef STBI_INCLUDE_STB_IMAGE_H
-#define STBI_INCLUDE_STB_IMAGE_H
-
-// DOCUMENTATION
-//
-// Limitations:
-//    - no 12-bit-per-channel JPEG
-//    - no JPEGs with arithmetic coding
-//    - GIF always returns *comp=4
-//
-// Basic usage (see HDR discussion below for HDR usage):
-//    int x,y,n;
-//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
-//    // ... process data if not NULL ...
-//    // ... x = width, y = height, n = # 8-bit components per pixel ...
-//    // ... replace '0' with '1'..'4' to force that many components per pixel
-//    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data);
-//
-// Standard parameters:
-//    int *x                 -- outputs image width in pixels
-//    int *y                 -- outputs image height in pixels
-//    int *channels_in_file  -- outputs # of image components in image file
-//    int desired_channels   -- if non-zero, # of image components requested in result
-//
-// The return value from an image loader is an 'unsigned char *' which points
-// to the pixel data, or NULL on an allocation failure or if the image is
-// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
-// with each pixel consisting of N interleaved 8-bit components; the first
-// pixel pointed to is top-left-most in the image. There is no padding between
-// image scanlines or between pixels, regardless of format. The number of
-// components N is 'desired_channels' if desired_channels is non-zero, or
-// *channels_in_file otherwise. If desired_channels is non-zero,
-// *channels_in_file has the number of components that _would_ have been
-// output otherwise. E.g. if you set desired_channels to 4, you will always
-// get RGBA output, but you can check *channels_in_file to see if it's trivially
-// opaque because e.g. there were only 3 channels in the source image.
-//
-// An output image with N components has the following components interleaved
-// in this order in each pixel:
-//
-//     N=#comp     components
-//       1           grey
-//       2           grey, alpha
-//       3           red, green, blue
-//       4           red, green, blue, alpha
-//
-// If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *channels_in_file will be unchanged. The function
-// stbi_failure_reason() can be queried for an extremely brief, end-user
-// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
-// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
-// more user-friendly ones.
-//
-// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
-//
-// To query the width, height and component count of an image without having to
-// decode the full file, you can use the stbi_info family of functions:
-//
-//   int x,y,n,ok;
-//   ok = stbi_info(filename, &x, &y, &n);
-//   // returns ok=1 and sets x, y, n if image is a supported format,
-//   // 0 otherwise.
-//
-// Note that stb_image pervasively uses ints in its public API for sizes,
-// including sizes of memory buffers. This is now part of the API and thus
-// hard to change without causing breakage. As a result, the various image
-// loaders all have certain limits on image size; these differ somewhat
-// by format but generally boil down to either just under 2GB or just under
-// 1GB. When the decoded image would be larger than this, stb_image decoding
-// will fail.
-//
-// Additionally, stb_image will reject image files that have any of their
-// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
-// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
-// the only way to have an image with such dimensions load correctly
-// is for it to have a rather extreme aspect ratio. Either way, the
-// assumption here is that such larger images are likely to be malformed
-// or malicious. If you do need to load an image with individual dimensions
-// larger than that, and it still fits in the overall size limit, you can
-// #define STBI_MAX_DIMENSIONS on your own to be something larger.
-//
-// ===========================================================================
-//
-// UNICODE:
-//
-//   If compiling for Windows and you wish to use Unicode filenames, compile
-//   with
-//       #define STBI_WINDOWS_UTF8
-//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
-//   Windows wchar_t filenames to utf8.
-//
-// ===========================================================================
-//
-// Philosophy
-//
-// stb libraries are designed with the following priorities:
-//
-//    1. easy to use
-//    2. easy to maintain
-//    3. good performance
-//
-// Sometimes I let "good performance" creep up in priority over "easy to maintain",
-// and for best performance I may provide less-easy-to-use APIs that give higher
-// performance, in addition to the easy-to-use ones. Nevertheless, it's important
-// to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
-//
-// Some secondary priorities arise directly from the first two, some of which
-// provide more explicit reasons why performance can't be emphasized.
-//
-//    - Portable ("ease of use")
-//    - Small source code footprint ("easy to maintain")
-//    - No dependencies ("ease of use")
-//
-// ===========================================================================
-//
-// I/O callbacks
-//
-// I/O callbacks allow you to read from arbitrary sources, like packaged
-// files or some other source. Data read from callbacks are processed
-// through a small internal buffer (currently 128 bytes) to try to reduce
-// overhead.
-//
-// The three functions you must define are "read" (reads some bytes of data),
-// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
-//
-// ===========================================================================
-//
-// SIMD support
-//
-// The JPEG decoder will try to automatically use SIMD kernels on x86 when
-// supported by the compiler. For ARM Neon support, you must explicitly
-// request it.
-//
-// (The old do-it-yourself SIMD API is no longer supported in the current
-// code.)
-//
-// On x86, SSE2 will automatically be used when available based on a run-time
-// test; if not, the generic C versions are used as a fall-back. On ARM targets,
-// the typical path is to have separate builds for NEON and non-NEON devices
-// (at least this is true for iOS and Android). Therefore, the NEON support is
-// toggled by a build flag: define STBI_NEON to get NEON loops.
-//
-// If for some reason you do not want to use any of SIMD code, or if
-// you have issues compiling it, you can disable it entirely by
-// defining STBI_NO_SIMD.
-//
-// ===========================================================================
-//
-// HDR image support   (disable by defining STBI_NO_HDR)
-//
-// stb_image supports loading HDR images in general, and currently the Radiance
-// .HDR file format specifically. You can still load any file through the existing
-// interface; if you attempt to load an HDR file, it will be automatically remapped
-// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
-// both of these constants can be reconfigured through this interface:
-//
-//     stbi_hdr_to_ldr_gamma(2.2f);
-//     stbi_hdr_to_ldr_scale(1.0f);
-//
-// (note, do not use _inverse_ constants; stbi_image will invert them
-// appropriately).
-//
-// Additionally, there is a new, parallel interface for loading files as
-// (linear) floats to preserve the full dynamic range:
-//
-//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
-//
-// If you load LDR images through this interface, those images will
-// be promoted to floating point values, run through the inverse of
-// constants corresponding to the above:
-//
-//     stbi_ldr_to_hdr_scale(1.0f);
-//     stbi_ldr_to_hdr_gamma(2.2f);
-//
-// Finally, given a filename (or an open file or memory block--see header
-// file for details) containing image data, you can query for the "most
-// appropriate" interface to use (that is, whether the image is HDR or
-// not), using:
-//
-//     stbi_is_hdr(char *filename);
-//
-// ===========================================================================
-//
-// iPhone PNG support:
-//
-// We optionally support converting iPhone-formatted PNGs (which store
-// premultiplied BGRA) back to RGB, even though they're internally encoded
-// differently. To enable this conversion, call
-// stbi_convert_iphone_png_to_rgb(1).
-//
-// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
-// pixel to remove any premultiplied alpha *only* if the image file explicitly
-// says there's premultiplied data (currently only happens in iPhone images,
-// and only if iPhone convert-to-rgb processing is on).
-//
-// ===========================================================================
-//
-// ADDITIONAL CONFIGURATION
-//
-//  - You can suppress implementation of any of the decoders to reduce
-//    your code footprint by #defining one or more of the following
-//    symbols before creating the implementation.
-//
-//        STBI_NO_JPEG
-//        STBI_NO_PNG
-//        STBI_NO_BMP
-//        STBI_NO_PSD
-//        STBI_NO_TGA
-//        STBI_NO_GIF
-//        STBI_NO_HDR
-//        STBI_NO_PIC
-//        STBI_NO_PNM   (.ppm and .pgm)
-//
-//  - You can request *only* certain decoders and suppress all other ones
-//    (this will be more forward-compatible, as addition of new decoders
-//    doesn't require you to disable them explicitly):
-//
-//        STBI_ONLY_JPEG
-//        STBI_ONLY_PNG
-//        STBI_ONLY_BMP
-//        STBI_ONLY_PSD
-//        STBI_ONLY_TGA
-//        STBI_ONLY_GIF
-//        STBI_ONLY_HDR
-//        STBI_ONLY_PIC
-//        STBI_ONLY_PNM   (.ppm and .pgm)
-//
-//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-//
-//  - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
-//    than that size (in either width or height) without further processing.
-//    This is to let programs in the wild set an upper bound to prevent
-//    denial-of-service attacks on untrusted data, as one could generate a
-//    valid image of gigantic dimensions and force stb_image to allocate a
-//    huge block of memory and spend disproportionate time decoding it. By
-//    default this is set to (1 << 24), which is 16777216, but that's still
-//    very big.
-
-#ifndef STBI_NO_STDIO
-#include <stdio.h>
-#endif // STBI_NO_STDIO
-
-#define STBI_VERSION 1
-
-enum
-{
-   STBI_default = 0, // only used for desired_channels
-
-   STBI_grey       = 1,
-   STBI_grey_alpha = 2,
-   STBI_rgb        = 3,
-   STBI_rgb_alpha  = 4
-};
-
-#include <stdlib.h>
-typedef unsigned char stbi_uc;
-typedef unsigned short stbi_us;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef STBIDEF
-#ifdef STB_IMAGE_STATIC
-#define STBIDEF static
-#else
-#define STBIDEF extern
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// PRIMARY API - works on images of any type
-//
-
-//
-// load image by filename, open file, or memory buffer
-//
-
-typedef struct
-{
-   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
-   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
-   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
-} stbi_io_callbacks;
-
-////////////////////////////////////
-//
-// 8-bits-per-channel interface
-//
-
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-// for stbi_load_from_file, file pointer is left pointing immediately after image
-#endif
-
-#ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
-#endif
-
-#ifdef STBI_WINDOWS_UTF8
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
-#endif
-
-////////////////////////////////////
-//
-// 16-bits-per-channel interface
-//
-
-STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-#endif
-
-////////////////////////////////////
-//
-// float-per-channel interface
-//
-#ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
-
-   #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-   #endif
-#endif
-
-#ifndef STBI_NO_HDR
-   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
-   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
-#endif // STBI_NO_HDR
-
-#ifndef STBI_NO_LINEAR
-   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
-   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
-#endif // STBI_NO_LINEAR
-
-// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
-STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
-STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
-#ifndef STBI_NO_STDIO
-STBIDEF int      stbi_is_hdr          (char const *filename);
-STBIDEF int      stbi_is_hdr_from_file(FILE *f);
-#endif // STBI_NO_STDIO
-
-
-// get a VERY brief reason for failure
-// on most compilers (and ALL modern mainstream compilers) this is threadsafe
-STBIDEF const char *stbi_failure_reason  (void);
-
-// free the loaded image -- this is just free()
-STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
-
-// get image dimensions & components without fully decoding
-STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
-STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
-STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
-
-#ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
-STBIDEF int      stbi_is_16_bit          (char const *filename);
-STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
-#endif
-
-
-
-// for image formats that explicitly notate that they have premultiplied alpha,
-// we just return the colors as stored in the file. set this flag to force
-// unpremultiplication. results are undefined if the unpremultiply overflow.
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
-
-// indicate whether we should process iphone images back to canonical format,
-// or just pass them through "as-is"
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
-
-// flip the image vertically, so the first pixel in the output array is the bottom left
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
-
-// as above, but only applies to images loaded on the thread that calls the function
-// this function is only available if your compiler supports thread-local variables;
-// calling it will fail to link if your compiler doesn't
-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
-
-// ZLIB client - used by PNG, available for other purposes
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
-STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-//
-//
-////   end header file   /////////////////////////////////////////////////////
-#endif // STBI_INCLUDE_STB_IMAGE_H
-
-#ifdef STB_IMAGE_IMPLEMENTATION
-
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
-  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
-  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
-  || defined(STBI_ONLY_ZLIB)
-   #ifndef STBI_ONLY_JPEG
-   #define STBI_NO_JPEG
-   #endif
-   #ifndef STBI_ONLY_PNG
-   #define STBI_NO_PNG
-   #endif
-   #ifndef STBI_ONLY_BMP
-   #define STBI_NO_BMP
-   #endif
-   #ifndef STBI_ONLY_PSD
-   #define STBI_NO_PSD
-   #endif
-   #ifndef STBI_ONLY_TGA
-   #define STBI_NO_TGA
-   #endif
-   #ifndef STBI_ONLY_GIF
-   #define STBI_NO_GIF
-   #endif
-   #ifndef STBI_ONLY_HDR
-   #define STBI_NO_HDR
-   #endif
-   #ifndef STBI_ONLY_PIC
-   #define STBI_NO_PIC
-   #endif
-   #ifndef STBI_ONLY_PNM
-   #define STBI_NO_PNM
-   #endif
-#endif
-
-#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
-#define STBI_NO_ZLIB
-#endif
-
-
-#include <stdarg.h>
-#include <stddef.h> // ptrdiff_t on osx
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp, pow
-#endif
-
-#ifndef STBI_NO_STDIO
-#include <stdio.h>
-#endif
-
-#ifndef STBI_ASSERT
-#include <assert.h>
-#define STBI_ASSERT(x) assert(x)
-#endif
-
-#ifdef __cplusplus
-#define STBI_EXTERN extern "C"
-#else
-#define STBI_EXTERN extern
-#endif
-
-
-#ifndef _MSC_VER
-   #ifdef __cplusplus
-   #define stbi_inline inline
-   #else
-   #define stbi_inline
-   #endif
-#else
-   #define stbi_inline __forceinline
-#endif
-
-#ifndef STBI_NO_THREAD_LOCALS
-   #if defined(__cplusplus) &&  __cplusplus >= 201103L
-      #define STBI_THREAD_LOCAL       thread_local
-   #elif defined(__GNUC__) && __GNUC__ < 5
-      #define STBI_THREAD_LOCAL       __thread
-   #elif defined(_MSC_VER)
-      #define STBI_THREAD_LOCAL       __declspec(thread)
-   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
-      #define STBI_THREAD_LOCAL       _Thread_local
-   #endif
-
-   #ifndef STBI_THREAD_LOCAL
-      #if defined(__GNUC__)
-        #define STBI_THREAD_LOCAL       __thread
-      #endif
-   #endif
-#endif
-
-#if defined(_MSC_VER) || defined(__SYMBIAN32__)
-typedef unsigned short stbi__uint16;
-typedef   signed short stbi__int16;
-typedef unsigned int   stbi__uint32;
-typedef   signed int   stbi__int32;
-#else
-#include <stdint.h>
-typedef uint16_t stbi__uint16;
-typedef int16_t  stbi__int16;
-typedef uint32_t stbi__uint32;
-typedef int32_t  stbi__int32;
-#endif
-
-// should produce compiler error if size is wrong
-typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
-
-#ifdef _MSC_VER
-#define STBI_NOTUSED(v)  (void)(v)
-#else
-#define STBI_NOTUSED(v)  (void)sizeof(v)
-#endif
-
-#ifdef _MSC_VER
-#define STBI_HAS_LROTL
-#endif
-
-#ifdef STBI_HAS_LROTL
-   #define stbi_lrot(x,y)  _lrotl(x,y)
-#else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
-#endif
-
-#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
-// ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
-// ok
-#else
-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
-#endif
-
-#ifndef STBI_MALLOC
-#define STBI_MALLOC(sz)           malloc(sz)
-#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
-#define STBI_FREE(p)              free(p)
-#endif
-
-#ifndef STBI_REALLOC_SIZED
-#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
-#endif
-
-// x86/x64 detection
-#if defined(__x86_64__) || defined(_M_X64)
-#define STBI__X64_TARGET
-#elif defined(__i386) || defined(_M_IX86)
-#define STBI__X86_TARGET
-#endif
-
-#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
-// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
-// but previous attempts to provide the SSE2 functions with runtime
-// detection caused numerous issues. The way architecture extensions are
-// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
-// New behavior: if compiled with -msse2, we use SSE2 without any
-// detection; if not, we don't use it at all.
-#define STBI_NO_SIMD
-#endif
-
-#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
-// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
-//
-// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
-// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
-// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
-// simultaneously enabling "-mstackrealign".
-//
-// See https://github.com/nothings/stb/issues/81 for more information.
-//
-// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
-// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
-#define STBI_NO_SIMD
-#endif
-
-#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
-#define STBI_SSE2
-#include <emmintrin.h>
-
-#ifdef _MSC_VER
-
-#if _MSC_VER >= 1400  // not VC6
-#include <intrin.h> // __cpuid
-static int stbi__cpuid3(void)
-{
-   int info[4];
-   __cpuid(info,1);
-   return info[3];
-}
-#else
-static int stbi__cpuid3(void)
-{
-   int res;
-   __asm {
-      mov  eax,1
-      cpuid
-      mov  res,edx
-   }
-   return res;
-}
-#endif
-
-#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
-
-#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void)
-{
-   int info3 = stbi__cpuid3();
-   return ((info3 >> 26) & 1) != 0;
-}
-#endif
-
-#else // assume GCC-style if not VC++
-#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
-
-#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void)
-{
-   // If we're even attempting to compile this on GCC/Clang, that means
-   // -msse2 is on, which means the compiler is allowed to use SSE2
-   // instructions at will, and so are we.
-   return 1;
-}
-#endif
-
-#endif
-#endif
-
-// ARM NEON
-#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
-#undef STBI_NEON
-#endif
-
-#ifdef STBI_NEON
-#include <arm_neon.h>
-#ifdef _MSC_VER
-#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
-#else
-#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
-#endif
-#endif
-
-#ifndef STBI_SIMD_ALIGN
-#define STBI_SIMD_ALIGN(type, name) type name
-#endif
-
-#ifndef STBI_MAX_DIMENSIONS
-#define STBI_MAX_DIMENSIONS (1 << 24)
-#endif
-
-///////////////////////////////////////////////
-//
-//  stbi__context struct and start_xxx functions
-
-// stbi__context structure is our basic context used by all images, so it
-// contains all the IO context, plus some basic image information
-typedef struct
-{
-   stbi__uint32 img_x, img_y;
-   int img_n, img_out_n;
-
-   stbi_io_callbacks io;
-   void *io_user_data;
-
-   int read_from_callbacks;
-   int buflen;
-   stbi_uc buffer_start[128];
-   int callback_already_read;
-
-   stbi_uc *img_buffer, *img_buffer_end;
-   stbi_uc *img_buffer_original, *img_buffer_original_end;
-} stbi__context;
-
-
-static void stbi__refill_buffer(stbi__context *s);
-
-// initialize a memory-decode context
-static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
-{
-   s->io.read = NULL;
-   s->read_from_callbacks = 0;
-   s->callback_already_read = 0;
-   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
-   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
-}
-
-// initialize a callback-based context
-static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
-{
-   s->io = *c;
-   s->io_user_data = user;
-   s->buflen = sizeof(s->buffer_start);
-   s->read_from_callbacks = 1;
-   s->callback_already_read = 0;
-   s->img_buffer = s->img_buffer_original = s->buffer_start;
-   stbi__refill_buffer(s);
-   s->img_buffer_original_end = s->img_buffer_end;
-}
-
-#ifndef STBI_NO_STDIO
-
-static int stbi__stdio_read(void *user, char *data, int size)
-{
-   return (int) fread(data,1,size,(FILE*) user);
-}
-
-static void stbi__stdio_skip(void *user, int n)
-{
-   int ch;
-   fseek((FILE*) user, n, SEEK_CUR);
-   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
-   if (ch != EOF) {
-      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. */
-   }
-}
-
-static int stbi__stdio_eof(void *user)
-{
-   return feof((FILE*) user) || ferror((FILE *) user);
-}
-
-static stbi_io_callbacks stbi__stdio_callbacks =
-{
-   stbi__stdio_read,
-   stbi__stdio_skip,
-   stbi__stdio_eof,
-};
-
-static void stbi__start_file(stbi__context *s, FILE *f)
-{
-   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
-}
-
-//static void stop_file(stbi__context *s) { }
-
-#endif // !STBI_NO_STDIO
-
-static void stbi__rewind(stbi__context *s)
-{
-   // conceptually rewind SHOULD rewind to the beginning of the stream,
-   // but we just rewind to the beginning of the initial buffer, because
-   // we only use it after doing 'test', which only ever looks at at most 92 bytes
-   s->img_buffer = s->img_buffer_original;
-   s->img_buffer_end = s->img_buffer_original_end;
-}
-
-enum
-{
-   STBI_ORDER_RGB,
-   STBI_ORDER_BGR
-};
-
-typedef struct
-{
-   int bits_per_channel;
-   int num_channels;
-   int channel_order;
-} stbi__result_info;
-
-#ifndef STBI_NO_JPEG
-static int      stbi__jpeg_test(stbi__context *s);
-static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PNG
-static int      stbi__png_test(stbi__context *s);
-static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__png_is16(stbi__context *s);
-#endif
-
-#ifndef STBI_NO_BMP
-static int      stbi__bmp_test(stbi__context *s);
-static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_TGA
-static int      stbi__tga_test(stbi__context *s);
-static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PSD
-static int      stbi__psd_test(stbi__context *s);
-static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
-static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__psd_is16(stbi__context *s);
-#endif
-
-#ifndef STBI_NO_HDR
-static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PIC
-static int      stbi__pic_test(stbi__context *s);
-static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_GIF
-static int      stbi__gif_test(stbi__context *s);
-static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
-static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PNM
-static int      stbi__pnm_test(stbi__context *s);
-static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__pnm_is16(stbi__context *s);
-#endif
-
-static
-#ifdef STBI_THREAD_LOCAL
-STBI_THREAD_LOCAL
-#endif
-const char *stbi__g_failure_reason;
-
-STBIDEF const char *stbi_failure_reason(void)
-{
-   return stbi__g_failure_reason;
-}
-
-#ifndef STBI_NO_FAILURE_STRINGS
-static int stbi__err(const char *str)
-{
-   stbi__g_failure_reason = str;
-   return 0;
-}
-#endif
-
-static void *stbi__malloc(size_t size)
-{
-    return STBI_MALLOC(size);
-}
-
-// stb_image uses ints pervasively, including for offset calculations.
-// therefore the largest decoded image size we can support with the
-// current code, even on 64-bit targets, is INT_MAX. this is not a
-// significant limitation for the intended use case.
-//
-// we do, however, need to make sure our size calculations don't
-// overflow. hence a few helper functions for size calculations that
-// multiply integers together, making sure that they're non-negative
-// and no overflow occurs.
-
-// return 1 if the sum is valid, 0 on overflow.
-// negative terms are considered invalid.
-static int stbi__addsizes_valid(int a, int b)
-{
-   if (b < 0) return 0;
-   // now 0 <= b <= INT_MAX, hence also
-   // 0 <= INT_MAX - b <= INTMAX.
-   // And "a + b <= INT_MAX" (which might overflow) is the
-   // same as a <= INT_MAX - b (no overflow)
-   return a <= INT_MAX - b;
-}
-
-// returns 1 if the product is valid, 0 on overflow.
-// negative factors are considered invalid.
-static int stbi__mul2sizes_valid(int a, int b)
-{
-   if (a < 0 || b < 0) return 0;
-   if (b == 0) return 1; // mul-by-0 is always safe
-   // portable way to check for no overflows in a*b
-   return a <= INT_MAX/b;
-}
-
-#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
-// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad2sizes_valid(int a, int b, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
-}
-#endif
-
-// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad3sizes_valid(int a, int b, int c, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
-      stbi__addsizes_valid(a*b*c, add);
-}
-
-// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
-static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
-      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
-}
-#endif
-
-#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
-// mallocs with size overflow checking
-static void *stbi__malloc_mad2(int a, int b, int add)
-{
-   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
-   return stbi__malloc(a*b + add);
-}
-#endif
-
-static void *stbi__malloc_mad3(int a, int b, int c, int add)
-{
-   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
-   return stbi__malloc(a*b*c + add);
-}
-
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
-static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
-{
-   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
-   return stbi__malloc(a*b*c*d + add);
-}
-#endif
-
-// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
-static int stbi__addints_valid(int a, int b)
-{
-   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
-   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
-   return a <= INT_MAX - b;
-}
-
-// returns 1 if the product of two signed shorts is valid, 0 on overflow.
-static int stbi__mul2shorts_valid(short a, short b)
-{
-   if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
-   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
-   if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
-   return a >= SHRT_MIN / b;
-}
-
-// stbi__err - error
-// stbi__errpf - error returning pointer to float
-// stbi__errpuc - error returning pointer to unsigned char
-
-#ifdef STBI_NO_FAILURE_STRINGS
-   #define stbi__err(x,y)  0
-#elif defined(STBI_FAILURE_USERMSG)
-   #define stbi__err(x,y)  stbi__err(y)
-#else
-   #define stbi__err(x,y)  stbi__err(x)
-#endif
-
-#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
-#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
-
-STBIDEF void stbi_image_free(void *retval_from_stbi_load)
-{
-   STBI_FREE(retval_from_stbi_load);
-}
-
-#ifndef STBI_NO_LINEAR
-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
-#endif
-
-#ifndef STBI_NO_HDR
-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
-#endif
-
-static int stbi__vertically_flip_on_load_global = 0;
-
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
-{
-   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
-}
-
-#ifndef STBI_THREAD_LOCAL
-#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
-#else
-static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
-
-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
-{
-   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
-   stbi__vertically_flip_on_load_set = 1;
-}
-
-#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
-                                         ? stbi__vertically_flip_on_load_local  \
-                                         : stbi__vertically_flip_on_load_global)
-#endif // STBI_THREAD_LOCAL
-
-static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
-{
-   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
-   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
-   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
-   ri->num_channels = 0;
-
-   // test the formats with a very explicit header first (at least a FOURCC
-   // or distinctive magic number first)
-   #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
-   #else
-   STBI_NOTUSED(bpc);
-   #endif
-   #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   // then the formats that can end up attempting to load with just 1 or 2
-   // bytes matching expectations; these are prone to false positives, so
-   // try them later
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
-      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
-   }
-   #endif
-
-   #ifndef STBI_NO_TGA
-   // test tga last because it's a crappy test!
-   if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
-}
-
-static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
-{
-   int i;
-   int img_len = w * h * channels;
-   stbi_uc *reduced;
-
-   reduced = (stbi_uc *) stbi__malloc(img_len);
-   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
-
-   for (i = 0; i < img_len; ++i)
-      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
-
-   STBI_FREE(orig);
-   return reduced;
-}
-
-static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
-{
-   int i;
-   int img_len = w * h * channels;
-   stbi__uint16 *enlarged;
-
-   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
-   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
-
-   for (i = 0; i < img_len; ++i)
-      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
-
-   STBI_FREE(orig);
-   return enlarged;
-}
-
-static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
-{
-   int row;
-   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
-   stbi_uc temp[2048];
-   stbi_uc *bytes = (stbi_uc *)image;
-
-   for (row = 0; row < (h>>1); row++) {
-      stbi_uc *row0 = bytes + row*bytes_per_row;
-      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
-      // swap row0 with row1
-      size_t bytes_left = bytes_per_row;
-      while (bytes_left) {
-         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
-         memcpy(temp, row0, bytes_copy);
-         memcpy(row0, row1, bytes_copy);
-         memcpy(row1, temp, bytes_copy);
-         row0 += bytes_copy;
-         row1 += bytes_copy;
-         bytes_left -= bytes_copy;
-      }
-   }
-}
-
-#ifndef STBI_NO_GIF
-static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
-{
-   int slice;
-   int slice_size = w * h * bytes_per_pixel;
-
-   stbi_uc *bytes = (stbi_uc *)image;
-   for (slice = 0; slice < z; ++slice) {
-      stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
-      bytes += slice_size;
-   }
-}
-#endif
-
-static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__result_info ri;
-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
-
-   if (result == NULL)
-      return NULL;
-
-   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
-   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
-
-   if (ri.bits_per_channel != 8) {
-      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
-      ri.bits_per_channel = 8;
-   }
-
-   // @TODO: move stbi__convert_format to here
-
-   if (stbi__vertically_flip_on_load) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
-   }
-
-   return (unsigned char *) result;
-}
-
-static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__result_info ri;
-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
-
-   if (result == NULL)
-      return NULL;
-
-   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
-   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
-
-   if (ri.bits_per_channel != 16) {
-      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
-      ri.bits_per_channel = 16;
-   }
-
-   // @TODO: move stbi__convert_format16 to here
-   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
-
-   if (stbi__vertically_flip_on_load) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
-   }
-
-   return (stbi__uint16 *) result;
-}
-
-#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
-static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
-{
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
-   }
-}
-#endif
-
-#ifndef STBI_NO_STDIO
-
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
-STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
-#endif
-
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
-{
-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
-}
-#endif
-
-static FILE *stbi__fopen(char const *filename, char const *mode)
-{
-   FILE *f;
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-   wchar_t wMode[64];
-   wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
-      return 0;
-
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
-      return 0;
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-	if (0 != _wfopen_s(&f, wFilename, wMode))
-		f = 0;
-#else
-   f = _wfopen(wFilename, wMode);
-#endif
-
-#elif defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, mode))
-      f=0;
-#else
-   f = fopen(filename, mode);
-#endif
-   return f;
-}
-
-
-STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   unsigned char *result;
-   if (!f) return stbi__errpuc("can't fopen", "Unable to open file");
-   result = stbi_load_from_file(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   unsigned char *result;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-   if (result) {
-      // need to 'unget' all the characters in the IO buffer
-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
-   }
-   return result;
-}
-
-STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__uint16 *result;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
-   if (result) {
-      // need to 'unget' all the characters in the IO buffer
-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
-   }
-   return result;
-}
-
-STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   stbi__uint16 *result;
-   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
-   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-
-#endif //!STBI_NO_STDIO
-
-STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
-}
-
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
-}
-
-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-}
-
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-}
-
-#ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
-{
-   unsigned char *result;
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-
-   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-   if (stbi__vertically_flip_on_load) {
-      stbi__vertical_flip_slices( result, *x, *y, *z, *comp );
-   }
-
-   return result;
-}
-#endif
-
-#ifndef STBI_NO_LINEAR
-static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   unsigned char *data;
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_test(s)) {
-      stbi__result_info ri;
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
-      if (hdr_data)
-         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
-      return hdr_data;
-   }
-   #endif
-   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-   if (data)
-      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
-   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
-}
-
-STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
-}
-
-STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   float *result;
-   FILE *f = stbi__fopen(filename, "rb");
-   if (!f) return stbi__errpf("can't fopen", "Unable to open file");
-   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_file(&s,f);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
-}
-#endif // !STBI_NO_STDIO
-
-#endif // !STBI_NO_LINEAR
-
-// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is
-// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
-// reports false!
-
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
-{
-   #ifndef STBI_NO_HDR
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__hdr_test(&s);
-   #else
-   STBI_NOTUSED(buffer);
-   STBI_NOTUSED(len);
-   return 0;
-   #endif
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF int      stbi_is_hdr          (char const *filename)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   int result=0;
-   if (f) {
-      result = stbi_is_hdr_from_file(f);
-      fclose(f);
-   }
-   return result;
-}
-
-STBIDEF int stbi_is_hdr_from_file(FILE *f)
-{
-   #ifndef STBI_NO_HDR
-   long pos = ftell(f);
-   int res;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   res = stbi__hdr_test(&s);
-   fseek(f, pos, SEEK_SET);
-   return res;
-   #else
-   STBI_NOTUSED(f);
-   return 0;
-   #endif
-}
-#endif // !STBI_NO_STDIO
-
-STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
-{
-   #ifndef STBI_NO_HDR
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__hdr_test(&s);
-   #else
-   STBI_NOTUSED(clbk);
-   STBI_NOTUSED(user);
-   return 0;
-   #endif
-}
-
-#ifndef STBI_NO_LINEAR
-static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
-
-STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
-STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
-#endif
-
-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
-
-STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
-STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Common code used by all image loaders
-//
-
-enum
-{
-   STBI__SCAN_load=0,
-   STBI__SCAN_type,
-   STBI__SCAN_header
-};
-
-static void stbi__refill_buffer(stbi__context *s)
-{
-   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
-   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
-   if (n == 0) {
-      // at end of file, treat same as if from memory, but need to handle case
-      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
-      s->read_from_callbacks = 0;
-      s->img_buffer = s->buffer_start;
-      s->img_buffer_end = s->buffer_start+1;
-      *s->img_buffer = 0;
-   } else {
-      s->img_buffer = s->buffer_start;
-      s->img_buffer_end = s->buffer_start + n;
-   }
-}
-
-stbi_inline static stbi_uc stbi__get8(stbi__context *s)
-{
-   if (s->img_buffer < s->img_buffer_end)
-      return *s->img_buffer++;
-   if (s->read_from_callbacks) {
-      stbi__refill_buffer(s);
-      return *s->img_buffer++;
-   }
-   return 0;
-}
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
-// nothing
-#else
-stbi_inline static int stbi__at_eof(stbi__context *s)
-{
-   if (s->io.read) {
-      if (!(s->io.eof)(s->io_user_data)) return 0;
-      // if feof() is true, check if buffer = end
-      // special case: we've only got the special 0 character at the end
-      if (s->read_from_callbacks == 0) return 1;
-   }
-
-   return s->img_buffer >= s->img_buffer_end;
-}
-#endif
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
-// nothing
-#else
-static void stbi__skip(stbi__context *s, int n)
-{
-   if (n == 0) return;  // already there!
-   if (n < 0) {
-      s->img_buffer = s->img_buffer_end;
-      return;
-   }
-   if (s->io.read) {
-      int blen = (int) (s->img_buffer_end - s->img_buffer);
-      if (blen < n) {
-         s->img_buffer = s->img_buffer_end;
-         (s->io.skip)(s->io_user_data, n - blen);
-         return;
-      }
-   }
-   s->img_buffer += n;
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
-// nothing
-#else
-static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
-{
-   if (s->io.read) {
-      int blen = (int) (s->img_buffer_end - s->img_buffer);
-      if (blen < n) {
-         int res, count;
-
-         memcpy(buffer, s->img_buffer, blen);
-
-         count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
-         res = (count == (n-blen));
-         s->img_buffer = s->img_buffer_end;
-         return res;
-      }
-   }
-
-   if (s->img_buffer+n <= s->img_buffer_end) {
-      memcpy(buffer, s->img_buffer, n);
-      s->img_buffer += n;
-      return 1;
-   } else
-      return 0;
-}
-#endif
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
-// nothing
-#else
-static int stbi__get16be(stbi__context *s)
-{
-   int z = stbi__get8(s);
-   return (z << 8) + stbi__get8(s);
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
-// nothing
-#else
-static stbi__uint32 stbi__get32be(stbi__context *s)
-{
-   stbi__uint32 z = stbi__get16be(s);
-   return (z << 16) + stbi__get16be(s);
-}
-#endif
-
-#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
-// nothing
-#else
-static int stbi__get16le(stbi__context *s)
-{
-   int z = stbi__get8(s);
-   return z + (stbi__get8(s) << 8);
-}
-#endif
-
-#ifndef STBI_NO_BMP
-static stbi__uint32 stbi__get32le(stbi__context *s)
-{
-   stbi__uint32 z = stbi__get16le(s);
-   z += (stbi__uint32)stbi__get16le(s) << 16;
-   return z;
-}
-#endif
-
-#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
-// nothing
-#else
-//////////////////////////////////////////////////////////////////////////////
-//
-//  generic converter from built-in img_n to req_comp
-//    individual types do this automatically as much as possible (e.g. jpeg
-//    does all cases internally since it needs to colorspace convert anyway,
-//    and it never has alpha, so very few cases ). png can automatically
-//    interleave an alpha=255 channel, but falls back to this for other cases
-//
-//  assume data buffer is malloced, so malloc a new one and free that one
-//  only failure mode is malloc failing
-
-static stbi_uc stbi__compute_y(int r, int g, int b)
-{
-   return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
-// nothing
-#else
-static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
-{
-   int i,j;
-   unsigned char *good;
-
-   if (req_comp == img_n) return data;
-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
-   if (good == NULL) {
-      STBI_FREE(data);
-      return stbi__errpuc("outofmem", "Out of memory");
-   }
-
-   for (j=0; j < (int) y; ++j) {
-      unsigned char *src  = data + j * x * img_n   ;
-      unsigned char *dest = good + j * x * req_comp;
-
-      #define STBI__COMBO(a,b)  ((a)*8+(b))
-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
-      // convert source image with img_n components to one with req_comp components;
-      // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (STBI__COMBO(img_n, req_comp)) {
-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
-         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
-         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
-         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
-         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
-      }
-      #undef STBI__CASE
-   }
-
-   STBI_FREE(data);
-   return good;
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
-// nothing
-#else
-static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
-{
-   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
-// nothing
-#else
-static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
-{
-   int i,j;
-   stbi__uint16 *good;
-
-   if (req_comp == img_n) return data;
-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
-   if (good == NULL) {
-      STBI_FREE(data);
-      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
-   }
-
-   for (j=0; j < (int) y; ++j) {
-      stbi__uint16 *src  = data + j * x * img_n   ;
-      stbi__uint16 *dest = good + j * x * req_comp;
-
-      #define STBI__COMBO(a,b)  ((a)*8+(b))
-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
-      // convert source image with img_n components to one with req_comp components;
-      // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (STBI__COMBO(img_n, req_comp)) {
-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
-         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
-         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
-         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
-         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
-      }
-      #undef STBI__CASE
-   }
-
-   STBI_FREE(data);
-   return good;
-}
-#endif
-
-#ifndef STBI_NO_LINEAR
-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
-{
-   int i,k,n;
-   float *output;
-   if (!data) return NULL;
-   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
-   // compute number of non-alpha components
-   if (comp & 1) n = comp; else n = comp-1;
-   for (i=0; i < x*y; ++i) {
-      for (k=0; k < n; ++k) {
-         output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
-      }
-   }
-   if (n < comp) {
-      for (i=0; i < x*y; ++i) {
-         output[i*comp + n] = data[i*comp + n]/255.0f;
-      }
-   }
-   STBI_FREE(data);
-   return output;
-}
-#endif
-
-#ifndef STBI_NO_HDR
-#define stbi__float2int(x)   ((int) (x))
-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
-{
-   int i,k,n;
-   stbi_uc *output;
-   if (!data) return NULL;
-   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
-   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
-   // compute number of non-alpha components
-   if (comp & 1) n = comp; else n = comp-1;
-   for (i=0; i < x*y; ++i) {
-      for (k=0; k < n; ++k) {
-         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
-         if (z < 0) z = 0;
-         if (z > 255) z = 255;
-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
-      }
-      if (k < comp) {
-         float z = data[i*comp+k] * 255 + 0.5f;
-         if (z < 0) z = 0;
-         if (z > 255) z = 255;
-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
-      }
-   }
-   STBI_FREE(data);
-   return output;
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-//
-//  "baseline" JPEG/JFIF decoder
-//
-//    simple implementation
-//      - doesn't support delayed output of y-dimension
-//      - simple interface (only one output format: 8-bit interleaved RGB)
-//      - doesn't try to recover corrupt jpegs
-//      - doesn't allow partial loading, loading multiple at once
-//      - still fast on x86 (copying globals into locals doesn't help x86)
-//      - allocates lots of intermediate memory (full size of all components)
-//        - non-interleaved case requires this anyway
-//        - allows good upsampling (see next)
-//    high-quality
-//      - upsampled channels are bilinearly interpolated, even across blocks
-//      - quality integer IDCT derived from IJG's 'slow'
-//    performance
-//      - fast huffman; reasonable integer IDCT
-//      - some SIMD kernels for common paths on targets with SSE2/NEON
-//      - uses a lot of intermediate memory, could cache poorly
-
-#ifndef STBI_NO_JPEG
-
-// huffman decoding acceleration
-#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
-
-typedef struct
-{
-   stbi_uc  fast[1 << FAST_BITS];
-   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
-   stbi__uint16 code[256];
-   stbi_uc  values[256];
-   stbi_uc  size[257];
-   unsigned int maxcode[18];
-   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
-} stbi__huffman;
-
-typedef struct
-{
-   stbi__context *s;
-   stbi__huffman huff_dc[4];
-   stbi__huffman huff_ac[4];
-   stbi__uint16 dequant[4][64];
-   stbi__int16 fast_ac[4][1 << FAST_BITS];
-
-// sizes for components, interleaved MCUs
-   int img_h_max, img_v_max;
-   int img_mcu_x, img_mcu_y;
-   int img_mcu_w, img_mcu_h;
-
-// definition of jpeg image component
-   struct
-   {
-      int id;
-      int h,v;
-      int tq;
-      int hd,ha;
-      int dc_pred;
-
-      int x,y,w2,h2;
-      stbi_uc *data;
-      void *raw_data, *raw_coeff;
-      stbi_uc *linebuf;
-      short   *coeff;   // progressive only
-      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
-   } img_comp[4];
-
-   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
-   int            code_bits;   // number of valid bits
-   unsigned char  marker;      // marker seen while filling entropy buffer
-   int            nomore;      // flag if we saw a marker so must stop
-
-   int            progressive;
-   int            spec_start;
-   int            spec_end;
-   int            succ_high;
-   int            succ_low;
-   int            eob_run;
-   int            jfif;
-   int            app14_color_transform; // Adobe APP14 tag
-   int            rgb;
-
-   int scan_n, order[4];
-   int restart_interval, todo;
-
-// kernels
-   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
-   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
-   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
-} stbi__jpeg;
-
-static int stbi__build_huffman(stbi__huffman *h, int *count)
-{
-   int i,j,k=0;
-   unsigned int code;
-   // build size list for each symbol (from JPEG spec)
-   for (i=0; i < 16; ++i) {
-      for (j=0; j < count[i]; ++j) {
-         h->size[k++] = (stbi_uc) (i+1);
-         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
-      }
-   }
-   h->size[k] = 0;
-
-   // compute actual symbols (from jpeg spec)
-   code = 0;
-   k = 0;
-   for(j=1; j <= 16; ++j) {
-      // compute delta to add to code to compute symbol id
-      h->delta[j] = k - code;
-      if (h->size[k] == j) {
-         while (h->size[k] == j)
-            h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
-      }
-      // compute largest code + 1 for this size, preshifted as needed later
-      h->maxcode[j] = code << (16-j);
-      code <<= 1;
-   }
-   h->maxcode[j] = 0xffffffff;
-
-   // build non-spec acceleration table; 255 is flag for not-accelerated
-   memset(h->fast, 255, 1 << FAST_BITS);
-   for (i=0; i < k; ++i) {
-      int s = h->size[i];
-      if (s <= FAST_BITS) {
-         int c = h->code[i] << (FAST_BITS-s);
-         int m = 1 << (FAST_BITS-s);
-         for (j=0; j < m; ++j) {
-            h->fast[c+j] = (stbi_uc) i;
-         }
-      }
-   }
-   return 1;
-}
-
-// build a table that decodes both magnitude and value of small ACs in
-// one go.
-static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
-{
-   int i;
-   for (i=0; i < (1 << FAST_BITS); ++i) {
-      stbi_uc fast = h->fast[i];
-      fast_ac[i] = 0;
-      if (fast < 255) {
-         int rs = h->values[fast];
-         int run = (rs >> 4) & 15;
-         int magbits = rs & 15;
-         int len = h->size[fast];
-
-         if (magbits && len + magbits <= FAST_BITS) {
-            // magnitude code followed by receive_extend code
-            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
-            int m = 1 << (magbits - 1);
-            if (k < m) k += (~0U << magbits) + 1;
-            // if the result is small enough, we can fit it in fast_ac table
-            if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
-         }
-      }
-   }
-}
-
-static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
-{
-   do {
-      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-      if (b == 0xff) {
-         int c = stbi__get8(j->s);
-         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
-         if (c != 0) {
-            j->marker = (unsigned char) c;
-            j->nomore = 1;
-            return;
-         }
-      }
-      j->code_buffer |= b << (24 - j->code_bits);
-      j->code_bits += 8;
-   } while (j->code_bits <= 24);
-}
-
-// (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
-
-// decode a jpeg huffman value from the bitstream
-stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
-{
-   unsigned int temp;
-   int c,k;
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-
-   // look at the top FAST_BITS and determine what symbol ID it is,
-   // if the code is <= FAST_BITS
-   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-   k = h->fast[c];
-   if (k < 255) {
-      int s = h->size[k];
-      if (s > j->code_bits)
-         return -1;
-      j->code_buffer <<= s;
-      j->code_bits -= s;
-      return h->values[k];
-   }
-
-   // naive test is to shift the code_buffer down so k bits are
-   // valid, then test against maxcode. To speed this up, we've
-   // preshifted maxcode left so that it has (16-k) 0s at the
-   // end; in other words, regardless of the number of bits, it
-   // wants to be compared against something shifted to have 16;
-   // that way we don't need to shift inside the loop.
-   temp = j->code_buffer >> 16;
-   for (k=FAST_BITS+1 ; ; ++k)
-      if (temp < h->maxcode[k])
-         break;
-   if (k == 17) {
-      // error! code not found
-      j->code_bits -= 16;
-      return -1;
-   }
-
-   if (k > j->code_bits)
-      return -1;
-
-   // convert the huffman code to the symbol id
-   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
-   if(c < 0 || c >= 256) // symbol id out of bounds!
-       return -1;
-   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
-
-   // convert the id to a symbol
-   j->code_bits -= k;
-   j->code_buffer <<= k;
-   return h->values[c];
-}
-
-// bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
-
-// combined JPEG 'receive' and JPEG 'extend', since baseline
-// always extends everything it receives.
-stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
-{
-   unsigned int k;
-   int sgn;
-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
-   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
-
-   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
-   k = stbi_lrot(j->code_buffer, n);
-   j->code_buffer = k & ~stbi__bmask[n];
-   k &= stbi__bmask[n];
-   j->code_bits -= n;
-   return k + (stbi__jbias[n] & (sgn - 1));
-}
-
-// get some unsigned bits
-stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
-{
-   unsigned int k;
-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
-   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
-   k = stbi_lrot(j->code_buffer, n);
-   j->code_buffer = k & ~stbi__bmask[n];
-   k &= stbi__bmask[n];
-   j->code_bits -= n;
-   return k;
-}
-
-stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
-{
-   unsigned int k;
-   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
-   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
-   k = j->code_buffer;
-   j->code_buffer <<= 1;
-   --j->code_bits;
-   return k & 0x80000000;
-}
-
-// given a value that's at position X in the zigzag stream,
-// where does it appear in the 8x8 matrix coded as row-major?
-static const stbi_uc stbi__jpeg_dezigzag[64+15] =
-{
-    0,  1,  8, 16,  9,  2,  3, 10,
-   17, 24, 32, 25, 18, 11,  4,  5,
-   12, 19, 26, 33, 40, 48, 41, 34,
-   27, 20, 13,  6,  7, 14, 21, 28,
-   35, 42, 49, 56, 57, 50, 43, 36,
-   29, 22, 15, 23, 30, 37, 44, 51,
-   58, 59, 52, 45, 38, 31, 39, 46,
-   53, 60, 61, 54, 47, 55, 62, 63,
-   // let corrupt input sample past end
-   63, 63, 63, 63, 63, 63, 63, 63,
-   63, 63, 63, 63, 63, 63, 63
-};
-
-// decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
-{
-   int diff,dc,k;
-   int t;
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-   t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
-
-   // 0 all the ac values now so we can do it 32-bits at a time
-   memset(data,0,64*sizeof(data[0]));
-
-   diff = t ? stbi__extend_receive(j, t) : 0;
-   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
-   dc = j->img_comp[b].dc_pred + diff;
-   j->img_comp[b].dc_pred = dc;
-   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-   data[0] = (short) (dc * dequant[0]);
-
-   // decode AC components, see JPEG spec
-   k = 1;
-   do {
-      unsigned int zig;
-      int c,r,s;
-      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-      r = fac[c];
-      if (r) { // fast-AC path
-         k += (r >> 4) & 15; // run
-         s = r & 15; // combined length
-         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
-         j->code_buffer <<= s;
-         j->code_bits -= s;
-         // decode into unzigzag'd location
-         zig = stbi__jpeg_dezigzag[k++];
-         data[zig] = (short) ((r >> 8) * dequant[zig]);
-      } else {
-         int rs = stbi__jpeg_huff_decode(j, hac);
-         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-         s = rs & 15;
-         r = rs >> 4;
-         if (s == 0) {
-            if (rs != 0xf0) break; // end block
-            k += 16;
-         } else {
-            k += r;
-            // decode into unzigzag'd location
-            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
-         }
-      }
-   } while (k < 64);
-   return 1;
-}
-
-static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
-{
-   int diff,dc;
-   int t;
-   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-
-   if (j->succ_high == 0) {
-      // first scan for DC coefficient, must be first
-      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
-      t = stbi__jpeg_huff_decode(j, hdc);
-      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-      diff = t ? stbi__extend_receive(j, t) : 0;
-
-      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
-      dc = j->img_comp[b].dc_pred + diff;
-      j->img_comp[b].dc_pred = dc;
-      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-      data[0] = (short) (dc * (1 << j->succ_low));
-   } else {
-      // refinement scan for DC coefficient
-      if (stbi__jpeg_get_bit(j))
-         data[0] += (short) (1 << j->succ_low);
-   }
-   return 1;
-}
-
-// @OPTIMIZE: store non-zigzagged during the decode passes,
-// and only de-zigzag when dequantizing
-static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
-{
-   int k;
-   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-   if (j->succ_high == 0) {
-      int shift = j->succ_low;
-
-      if (j->eob_run) {
-         --j->eob_run;
-         return 1;
-      }
-
-      k = j->spec_start;
-      do {
-         unsigned int zig;
-         int c,r,s;
-         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-         r = fac[c];
-         if (r) { // fast-AC path
-            k += (r >> 4) & 15; // run
-            s = r & 15; // combined length
-            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
-            j->code_buffer <<= s;
-            j->code_bits -= s;
-            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) * (1 << shift));
-         } else {
-            int rs = stbi__jpeg_huff_decode(j, hac);
-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-               if (r < 15) {
-                  j->eob_run = (1 << r);
-                  if (r)
-                     j->eob_run += stbi__jpeg_get_bits(j, r);
-                  --j->eob_run;
-                  break;
-               }
-               k += 16;
-            } else {
-               k += r;
-               zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
-            }
-         }
-      } while (k <= j->spec_end);
-   } else {
-      // refinement scan for these AC coefficients
-
-      short bit = (short) (1 << j->succ_low);
-
-      if (j->eob_run) {
-         --j->eob_run;
-         for (k = j->spec_start; k <= j->spec_end; ++k) {
-            short *p = &data[stbi__jpeg_dezigzag[k]];
-            if (*p != 0)
-               if (stbi__jpeg_get_bit(j))
-                  if ((*p & bit)==0) {
-                     if (*p > 0)
-                        *p += bit;
-                     else
-                        *p -= bit;
-                  }
-         }
-      } else {
-         k = j->spec_start;
-         do {
-            int r,s;
-            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-               if (r < 15) {
-                  j->eob_run = (1 << r) - 1;
-                  if (r)
-                     j->eob_run += stbi__jpeg_get_bits(j, r);
-                  r = 64; // force end of block
-               } else {
-                  // r=15 s=0 should write 16 0s, so we just do
-                  // a run of 15 0s and then write s (which is 0),
-                  // so we don't have to do anything special here
-               }
-            } else {
-               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
-               // sign bit
-               if (stbi__jpeg_get_bit(j))
-                  s = bit;
-               else
-                  s = -bit;
-            }
-
-            // advance by r
-            while (k <= j->spec_end) {
-               short *p = &data[stbi__jpeg_dezigzag[k++]];
-               if (*p != 0) {
-                  if (stbi__jpeg_get_bit(j))
-                     if ((*p & bit)==0) {
-                        if (*p > 0)
-                           *p += bit;
-                        else
-                           *p -= bit;
-                     }
-               } else {
-                  if (r == 0) {
-                     *p = (short) s;
-                     break;
-                  }
-                  --r;
-               }
-            }
-         } while (k <= j->spec_end);
-      }
-   }
-   return 1;
-}
-
-// take a -128..127 value and stbi__clamp it and convert to 0..255
-stbi_inline static stbi_uc stbi__clamp(int x)
-{
-   // trick to use a single test to catch both cases
-   if ((unsigned int) x > 255) {
-      if (x < 0) return 0;
-      if (x > 255) return 255;
-   }
-   return (stbi_uc) x;
-}
-
-#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) * 4096)
-
-// derived from jidctint -- DCT_ISLOW
-#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
-   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
-   p2 = s2;                                    \
-   p3 = s6;                                    \
-   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
-   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
-   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
-   p2 = s0;                                    \
-   p3 = s4;                                    \
-   t0 = stbi__fsh(p2+p3);                      \
-   t1 = stbi__fsh(p2-p3);                      \
-   x0 = t0+t3;                                 \
-   x3 = t0-t3;                                 \
-   x1 = t1+t2;                                 \
-   x2 = t1-t2;                                 \
-   t0 = s7;                                    \
-   t1 = s5;                                    \
-   t2 = s3;                                    \
-   t3 = s1;                                    \
-   p3 = t0+t2;                                 \
-   p4 = t1+t3;                                 \
-   p1 = t0+t3;                                 \
-   p2 = t1+t2;                                 \
-   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
-   t0 = t0*stbi__f2f( 0.298631336f);           \
-   t1 = t1*stbi__f2f( 2.053119869f);           \
-   t2 = t2*stbi__f2f( 3.072711026f);           \
-   t3 = t3*stbi__f2f( 1.501321110f);           \
-   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
-   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
-   p3 = p3*stbi__f2f(-1.961570560f);           \
-   p4 = p4*stbi__f2f(-0.390180644f);           \
-   t3 += p1+p4;                                \
-   t2 += p2+p3;                                \
-   t1 += p2+p4;                                \
-   t0 += p1+p3;
-
-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
-{
-   int i,val[64],*v=val;
-   stbi_uc *o;
-   short *d = data;
-
-   // columns
-   for (i=0; i < 8; ++i,++d, ++v) {
-      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
-           && d[40]==0 && d[48]==0 && d[56]==0) {
-         //    no shortcut                 0     seconds
-         //    (1|2|3|4|5|6|7)==0          0     seconds
-         //    all separate               -0.047 seconds
-         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0]*4;
-         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
-      } else {
-         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
-         // constants scaled things up by 1<<12; let's bring them back
-         // down, but keep 2 extra bits of precision
-         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
-         v[ 0] = (x0+t3) >> 10;
-         v[56] = (x0-t3) >> 10;
-         v[ 8] = (x1+t2) >> 10;
-         v[48] = (x1-t2) >> 10;
-         v[16] = (x2+t1) >> 10;
-         v[40] = (x2-t1) >> 10;
-         v[24] = (x3+t0) >> 10;
-         v[32] = (x3-t0) >> 10;
-      }
-   }
-
-   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
-      // no fast case since the first 1D IDCT spread components out
-      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
-      // constants scaled things up by 1<<12, plus we had 1<<2 from first
-      // loop, plus horizontal and vertical each scale by sqrt(8) so together
-      // we've got an extra 1<<3, so 1<<17 total we need to remove.
-      // so we want to round that, which means adding 0.5 * 1<<17,
-      // aka 65536. Also, we'll end up with -128 to 127 that we want
-      // to encode as 0..255 by adding 128, so we'll add that before the shift
-      x0 += 65536 + (128<<17);
-      x1 += 65536 + (128<<17);
-      x2 += 65536 + (128<<17);
-      x3 += 65536 + (128<<17);
-      // tried computing the shifts into temps, or'ing the temps to see
-      // if any were out of range, but that was slower
-      o[0] = stbi__clamp((x0+t3) >> 17);
-      o[7] = stbi__clamp((x0-t3) >> 17);
-      o[1] = stbi__clamp((x1+t2) >> 17);
-      o[6] = stbi__clamp((x1-t2) >> 17);
-      o[2] = stbi__clamp((x2+t1) >> 17);
-      o[5] = stbi__clamp((x2-t1) >> 17);
-      o[3] = stbi__clamp((x3+t0) >> 17);
-      o[4] = stbi__clamp((x3-t0) >> 17);
-   }
-}
-
-#ifdef STBI_SSE2
-// sse2 integer IDCT. not the fastest possible implementation but it
-// produces bit-identical results to the generic C version so it's
-// fully "transparent".
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
-{
-   // This is constructed to match our regular (generic) integer IDCT exactly.
-   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-   __m128i tmp;
-
-   // dot product constant: even elems=x, odd elems=y
-   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
-
-   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
-   // out(1) = c1[even]*x + c1[odd]*y
-   #define dct_rot(out0,out1, x,y,c0,c1) \
-      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
-      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
-      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
-      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
-      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
-      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
-
-   // out = in << 12  (in 16-bit, out 32-bit)
-   #define dct_widen(out, in) \
-      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
-      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
-
-   // wide add
-   #define dct_wadd(out, a, b) \
-      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
-      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
-
-   // wide sub
-   #define dct_wsub(out, a, b) \
-      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
-      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
-
-   // butterfly a/b, add bias, then shift by "s" and pack
-   #define dct_bfly32o(out0, out1, a,b,bias,s) \
-      { \
-         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
-         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
-         dct_wadd(sum, abiased, b); \
-         dct_wsub(dif, abiased, b); \
-         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
-         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
-      }
-
-   // 8-bit interleave step (for transposes)
-   #define dct_interleave8(a, b) \
-      tmp = a; \
-      a = _mm_unpacklo_epi8(a, b); \
-      b = _mm_unpackhi_epi8(tmp, b)
-
-   // 16-bit interleave step (for transposes)
-   #define dct_interleave16(a, b) \
-      tmp = a; \
-      a = _mm_unpacklo_epi16(a, b); \
-      b = _mm_unpackhi_epi16(tmp, b)
-
-   #define dct_pass(bias,shift) \
-      { \
-         /* even part */ \
-         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
-         __m128i sum04 = _mm_add_epi16(row0, row4); \
-         __m128i dif04 = _mm_sub_epi16(row0, row4); \
-         dct_widen(t0e, sum04); \
-         dct_widen(t1e, dif04); \
-         dct_wadd(x0, t0e, t3e); \
-         dct_wsub(x3, t0e, t3e); \
-         dct_wadd(x1, t1e, t2e); \
-         dct_wsub(x2, t1e, t2e); \
-         /* odd part */ \
-         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
-         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
-         __m128i sum17 = _mm_add_epi16(row1, row7); \
-         __m128i sum35 = _mm_add_epi16(row3, row5); \
-         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
-         dct_wadd(x4, y0o, y4o); \
-         dct_wadd(x5, y1o, y5o); \
-         dct_wadd(x6, y2o, y5o); \
-         dct_wadd(x7, y3o, y4o); \
-         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
-         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
-         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
-         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
-      }
-
-   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
-   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
-   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
-   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
-   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
-   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
-   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
-   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
-
-   // rounding biases in column/row passes, see stbi__idct_block for explanation.
-   __m128i bias_0 = _mm_set1_epi32(512);
-   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
-
-   // load
-   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
-   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
-   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
-   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
-   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
-   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
-   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
-   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
-
-   // column pass
-   dct_pass(bias_0, 10);
-
-   {
-      // 16bit 8x8 transpose pass 1
-      dct_interleave16(row0, row4);
-      dct_interleave16(row1, row5);
-      dct_interleave16(row2, row6);
-      dct_interleave16(row3, row7);
-
-      // transpose pass 2
-      dct_interleave16(row0, row2);
-      dct_interleave16(row1, row3);
-      dct_interleave16(row4, row6);
-      dct_interleave16(row5, row7);
-
-      // transpose pass 3
-      dct_interleave16(row0, row1);
-      dct_interleave16(row2, row3);
-      dct_interleave16(row4, row5);
-      dct_interleave16(row6, row7);
-   }
-
-   // row pass
-   dct_pass(bias_1, 17);
-
-   {
-      // pack
-      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
-      __m128i p1 = _mm_packus_epi16(row2, row3);
-      __m128i p2 = _mm_packus_epi16(row4, row5);
-      __m128i p3 = _mm_packus_epi16(row6, row7);
-
-      // 8bit 8x8 transpose pass 1
-      dct_interleave8(p0, p2); // a0e0a1e1...
-      dct_interleave8(p1, p3); // c0g0c1g1...
-
-      // transpose pass 2
-      dct_interleave8(p0, p1); // a0c0e0g0...
-      dct_interleave8(p2, p3); // b0d0f0h0...
-
-      // transpose pass 3
-      dct_interleave8(p0, p2); // a0b0c0d0...
-      dct_interleave8(p1, p3); // a4b4c4d4...
-
-      // store
-      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
-   }
-
-#undef dct_const
-#undef dct_rot
-#undef dct_widen
-#undef dct_wadd
-#undef dct_wsub
-#undef dct_bfly32o
-#undef dct_interleave8
-#undef dct_interleave16
-#undef dct_pass
-}
-
-#endif // STBI_SSE2
-
-#ifdef STBI_NEON
-
-// NEON integer IDCT. should produce bit-identical
-// results to the generic C version.
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
-{
-   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
-
-   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
-   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
-   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
-   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
-   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
-   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
-   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
-   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
-   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
-   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
-   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
-   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
-
-#define dct_long_mul(out, inq, coeff) \
-   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
-   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
-
-#define dct_long_mac(out, acc, inq, coeff) \
-   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
-   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
-
-#define dct_widen(out, inq) \
-   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
-   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
-
-// wide add
-#define dct_wadd(out, a, b) \
-   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
-   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
-
-// wide sub
-#define dct_wsub(out, a, b) \
-   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
-   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
-
-// butterfly a/b, then shift using "shiftop" by "s" and pack
-#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
-   { \
-      dct_wadd(sum, a, b); \
-      dct_wsub(dif, a, b); \
-      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
-      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
-   }
-
-#define dct_pass(shiftop, shift) \
-   { \
-      /* even part */ \
-      int16x8_t sum26 = vaddq_s16(row2, row6); \
-      dct_long_mul(p1e, sum26, rot0_0); \
-      dct_long_mac(t2e, p1e, row6, rot0_1); \
-      dct_long_mac(t3e, p1e, row2, rot0_2); \
-      int16x8_t sum04 = vaddq_s16(row0, row4); \
-      int16x8_t dif04 = vsubq_s16(row0, row4); \
-      dct_widen(t0e, sum04); \
-      dct_widen(t1e, dif04); \
-      dct_wadd(x0, t0e, t3e); \
-      dct_wsub(x3, t0e, t3e); \
-      dct_wadd(x1, t1e, t2e); \
-      dct_wsub(x2, t1e, t2e); \
-      /* odd part */ \
-      int16x8_t sum15 = vaddq_s16(row1, row5); \
-      int16x8_t sum17 = vaddq_s16(row1, row7); \
-      int16x8_t sum35 = vaddq_s16(row3, row5); \
-      int16x8_t sum37 = vaddq_s16(row3, row7); \
-      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
-      dct_long_mul(p5o, sumodd, rot1_0); \
-      dct_long_mac(p1o, p5o, sum17, rot1_1); \
-      dct_long_mac(p2o, p5o, sum35, rot1_2); \
-      dct_long_mul(p3o, sum37, rot2_0); \
-      dct_long_mul(p4o, sum15, rot2_1); \
-      dct_wadd(sump13o, p1o, p3o); \
-      dct_wadd(sump24o, p2o, p4o); \
-      dct_wadd(sump23o, p2o, p3o); \
-      dct_wadd(sump14o, p1o, p4o); \
-      dct_long_mac(x4, sump13o, row7, rot3_0); \
-      dct_long_mac(x5, sump24o, row5, rot3_1); \
-      dct_long_mac(x6, sump23o, row3, rot3_2); \
-      dct_long_mac(x7, sump14o, row1, rot3_3); \
-      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
-      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
-      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
-      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
-   }
-
-   // load
-   row0 = vld1q_s16(data + 0*8);
-   row1 = vld1q_s16(data + 1*8);
-   row2 = vld1q_s16(data + 2*8);
-   row3 = vld1q_s16(data + 3*8);
-   row4 = vld1q_s16(data + 4*8);
-   row5 = vld1q_s16(data + 5*8);
-   row6 = vld1q_s16(data + 6*8);
-   row7 = vld1q_s16(data + 7*8);
-
-   // add DC bias
-   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
-
-   // column pass
-   dct_pass(vrshrn_n_s32, 10);
-
-   // 16bit 8x8 transpose
-   {
-// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
-// whether compilers actually get this is another story, sadly.
-#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
-#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
-#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
-
-      // pass 1
-      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
-      dct_trn16(row2, row3);
-      dct_trn16(row4, row5);
-      dct_trn16(row6, row7);
-
-      // pass 2
-      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
-      dct_trn32(row1, row3);
-      dct_trn32(row4, row6);
-      dct_trn32(row5, row7);
-
-      // pass 3
-      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
-      dct_trn64(row1, row5);
-      dct_trn64(row2, row6);
-      dct_trn64(row3, row7);
-
-#undef dct_trn16
-#undef dct_trn32
-#undef dct_trn64
-   }
-
-   // row pass
-   // vrshrn_n_s32 only supports shifts up to 16, we need
-   // 17. so do a non-rounding shift of 16 first then follow
-   // up with a rounding shift by 1.
-   dct_pass(vshrn_n_s32, 16);
-
-   {
-      // pack and round
-      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
-      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
-      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
-      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
-      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
-      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
-      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
-      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
-
-      // again, these can translate into one instruction, but often don't.
-#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
-#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
-#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
-
-      // sadly can't use interleaved stores here since we only write
-      // 8 bytes to each scan line!
-
-      // 8x8 8-bit transpose pass 1
-      dct_trn8_8(p0, p1);
-      dct_trn8_8(p2, p3);
-      dct_trn8_8(p4, p5);
-      dct_trn8_8(p6, p7);
-
-      // pass 2
-      dct_trn8_16(p0, p2);
-      dct_trn8_16(p1, p3);
-      dct_trn8_16(p4, p6);
-      dct_trn8_16(p5, p7);
-
-      // pass 3
-      dct_trn8_32(p0, p4);
-      dct_trn8_32(p1, p5);
-      dct_trn8_32(p2, p6);
-      dct_trn8_32(p3, p7);
-
-      // store
-      vst1_u8(out, p0); out += out_stride;
-      vst1_u8(out, p1); out += out_stride;
-      vst1_u8(out, p2); out += out_stride;
-      vst1_u8(out, p3); out += out_stride;
-      vst1_u8(out, p4); out += out_stride;
-      vst1_u8(out, p5); out += out_stride;
-      vst1_u8(out, p6); out += out_stride;
-      vst1_u8(out, p7);
-
-#undef dct_trn8_8
-#undef dct_trn8_16
-#undef dct_trn8_32
-   }
-
-#undef dct_long_mul
-#undef dct_long_mac
-#undef dct_widen
-#undef dct_wadd
-#undef dct_wsub
-#undef dct_bfly32o
-#undef dct_pass
-}
-
-#endif // STBI_NEON
-
-#define STBI__MARKER_none  0xff
-// if there's a pending marker from the entropy stream, return that
-// otherwise, fetch from the stream and get a marker. if there's no
-// marker, return 0xff, which is never a valid marker value
-static stbi_uc stbi__get_marker(stbi__jpeg *j)
-{
-   stbi_uc x;
-   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
-   x = stbi__get8(j->s);
-   if (x != 0xff) return STBI__MARKER_none;
-   while (x == 0xff)
-      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
-   return x;
-}
-
-// in each scan, we'll have scan_n components, and the order
-// of the components is specified by order[]
-#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
-
-// after a restart interval, stbi__jpeg_reset the entropy decoder and
-// the dc prediction
-static void stbi__jpeg_reset(stbi__jpeg *j)
-{
-   j->code_bits = 0;
-   j->code_buffer = 0;
-   j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
-   j->marker = STBI__MARKER_none;
-   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
-   j->eob_run = 0;
-   // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
-   // since we don't even allow 1<<30 pixels
-}
-
-static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
-{
-   stbi__jpeg_reset(z);
-   if (!z->progressive) {
-      if (z->scan_n == 1) {
-         int i,j;
-         STBI_SIMD_ALIGN(short, data[64]);
-         int n = z->order[0];
-         // non-interleaved data, we just need to process one block at a time,
-         // in trivial scanline order
-         // number of blocks to do just depends on how many actual "pixels" this
-         // component has, independent of interleaved MCU blocking and such
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               int ha = z->img_comp[n].ha;
-               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
-               // every data block is an MCU, so countdown the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  // if it's NOT a restart, then just bail, so we get corrupt data
-                  // rather than no data
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      } else { // interleaved
-         int i,j,k,x,y;
-         STBI_SIMD_ALIGN(short, data[64]);
-         for (j=0; j < z->img_mcu_y; ++j) {
-            for (i=0; i < z->img_mcu_x; ++i) {
-               // scan an interleaved mcu... process scan_n components in order
-               for (k=0; k < z->scan_n; ++k) {
-                  int n = z->order[k];
-                  // scan out an mcu's worth of this component; that's just determined
-                  // by the basic H and V specified for the component
-                  for (y=0; y < z->img_comp[n].v; ++y) {
-                     for (x=0; x < z->img_comp[n].h; ++x) {
-                        int x2 = (i*z->img_comp[n].h + x)*8;
-                        int y2 = (j*z->img_comp[n].v + y)*8;
-                        int ha = z->img_comp[n].ha;
-                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
-                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
-                     }
-                  }
-               }
-               // after all interleaved components, that's an interleaved MCU,
-               // so now count down the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      }
-   } else {
-      if (z->scan_n == 1) {
-         int i,j;
-         int n = z->order[0];
-         // non-interleaved data, we just need to process one block at a time,
-         // in trivial scanline order
-         // number of blocks to do just depends on how many actual "pixels" this
-         // component has, independent of interleaved MCU blocking and such
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-               if (z->spec_start == 0) {
-                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                     return 0;
-               } else {
-                  int ha = z->img_comp[n].ha;
-                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
-                     return 0;
-               }
-               // every data block is an MCU, so countdown the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      } else { // interleaved
-         int i,j,k,x,y;
-         for (j=0; j < z->img_mcu_y; ++j) {
-            for (i=0; i < z->img_mcu_x; ++i) {
-               // scan an interleaved mcu... process scan_n components in order
-               for (k=0; k < z->scan_n; ++k) {
-                  int n = z->order[k];
-                  // scan out an mcu's worth of this component; that's just determined
-                  // by the basic H and V specified for the component
-                  for (y=0; y < z->img_comp[n].v; ++y) {
-                     for (x=0; x < z->img_comp[n].h; ++x) {
-                        int x2 = (i*z->img_comp[n].h + x);
-                        int y2 = (j*z->img_comp[n].v + y);
-                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                           return 0;
-                     }
-                  }
-               }
-               // after all interleaved components, that's an interleaved MCU,
-               // so now count down the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      }
-   }
-}
-
-static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
-{
-   int i;
-   for (i=0; i < 64; ++i)
-      data[i] *= dequant[i];
-}
-
-static void stbi__jpeg_finish(stbi__jpeg *z)
-{
-   if (z->progressive) {
-      // dequantize and idct the data
-      int i,j,n;
-      for (n=0; n < z->s->img_n; ++n) {
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
-            }
-         }
-      }
-   }
-}
-
-static int stbi__process_marker(stbi__jpeg *z, int m)
-{
-   int L;
-   switch (m) {
-      case STBI__MARKER_none: // no marker found
-         return stbi__err("expected marker","Corrupt JPEG");
-
-      case 0xDD: // DRI - specify restart interval
-         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
-         z->restart_interval = stbi__get16be(z->s);
-         return 1;
-
-      case 0xDB: // DQT - define quantization table
-         L = stbi__get16be(z->s)-2;
-         while (L > 0) {
-            int q = stbi__get8(z->s);
-            int p = q >> 4, sixteen = (p != 0);
-            int t = q & 15,i;
-            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
-            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
-
-            for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
-            L -= (sixteen ? 129 : 65);
-         }
-         return L==0;
-
-      case 0xC4: // DHT - define huffman table
-         L = stbi__get16be(z->s)-2;
-         while (L > 0) {
-            stbi_uc *v;
-            int sizes[16],i,n=0;
-            int q = stbi__get8(z->s);
-            int tc = q >> 4;
-            int th = q & 15;
-            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
-            for (i=0; i < 16; ++i) {
-               sizes[i] = stbi__get8(z->s);
-               n += sizes[i];
-            }
-            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
-            L -= 17;
-            if (tc == 0) {
-               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
-               v = z->huff_dc[th].values;
-            } else {
-               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
-               v = z->huff_ac[th].values;
-            }
-            for (i=0; i < n; ++i)
-               v[i] = stbi__get8(z->s);
-            if (tc != 0)
-               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
-            L -= n;
-         }
-         return L==0;
-   }
-
-   // check for comment block or APP blocks
-   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      L = stbi__get16be(z->s);
-      if (L < 2) {
-         if (m == 0xFE)
-            return stbi__err("bad COM len","Corrupt JPEG");
-         else
-            return stbi__err("bad APP len","Corrupt JPEG");
-      }
-      L -= 2;
-
-      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
-         static const unsigned char tag[5] = {'J','F','I','F','\0'};
-         int ok = 1;
-         int i;
-         for (i=0; i < 5; ++i)
-            if (stbi__get8(z->s) != tag[i])
-               ok = 0;
-         L -= 5;
-         if (ok)
-            z->jfif = 1;
-      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
-         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
-         int ok = 1;
-         int i;
-         for (i=0; i < 6; ++i)
-            if (stbi__get8(z->s) != tag[i])
-               ok = 0;
-         L -= 6;
-         if (ok) {
-            stbi__get8(z->s); // version
-            stbi__get16be(z->s); // flags0
-            stbi__get16be(z->s); // flags1
-            z->app14_color_transform = stbi__get8(z->s); // color transform
-            L -= 6;
-         }
-      }
-
-      stbi__skip(z->s, L);
-      return 1;
-   }
-
-   return stbi__err("unknown marker","Corrupt JPEG");
-}
-
-// after we see SOS
-static int stbi__process_scan_header(stbi__jpeg *z)
-{
-   int i;
-   int Ls = stbi__get16be(z->s);
-   z->scan_n = stbi__get8(z->s);
-   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
-   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
-   for (i=0; i < z->scan_n; ++i) {
-      int id = stbi__get8(z->s), which;
-      int q = stbi__get8(z->s);
-      for (which = 0; which < z->s->img_n; ++which)
-         if (z->img_comp[which].id == id)
-            break;
-      if (which == z->s->img_n) return 0; // no match
-      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
-      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
-      z->order[i] = which;
-   }
-
-   {
-      int aa;
-      z->spec_start = stbi__get8(z->s);
-      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
-      aa = stbi__get8(z->s);
-      z->succ_high = (aa >> 4);
-      z->succ_low  = (aa & 15);
-      if (z->progressive) {
-         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
-            return stbi__err("bad SOS", "Corrupt JPEG");
-      } else {
-         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
-         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
-         z->spec_end = 63;
-      }
-   }
-
-   return 1;
-}
-
-static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
-{
-   int i;
-   for (i=0; i < ncomp; ++i) {
-      if (z->img_comp[i].raw_data) {
-         STBI_FREE(z->img_comp[i].raw_data);
-         z->img_comp[i].raw_data = NULL;
-         z->img_comp[i].data = NULL;
-      }
-      if (z->img_comp[i].raw_coeff) {
-         STBI_FREE(z->img_comp[i].raw_coeff);
-         z->img_comp[i].raw_coeff = 0;
-         z->img_comp[i].coeff = 0;
-      }
-      if (z->img_comp[i].linebuf) {
-         STBI_FREE(z->img_comp[i].linebuf);
-         z->img_comp[i].linebuf = NULL;
-      }
-   }
-   return why;
-}
-
-static int stbi__process_frame_header(stbi__jpeg *z, int scan)
-{
-   stbi__context *s = z->s;
-   int Lf,p,i,q, h_max=1,v_max=1,c;
-   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
-   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
-   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
-   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-   c = stbi__get8(s);
-   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
-   s->img_n = c;
-   for (i=0; i < c; ++i) {
-      z->img_comp[i].data = NULL;
-      z->img_comp[i].linebuf = NULL;
-   }
-
-   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
-
-   z->rgb = 0;
-   for (i=0; i < s->img_n; ++i) {
-      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
-      z->img_comp[i].id = stbi__get8(s);
-      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
-         ++z->rgb;
-      q = stbi__get8(s);
-      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
-      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
-      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
-   }
-
-   if (scan != STBI__SCAN_load) return 1;
-
-   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
-
-   for (i=0; i < s->img_n; ++i) {
-      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
-      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
-   }
-
-   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
-   // and I've never seen a non-corrupted JPEG file actually use them
-   for (i=0; i < s->img_n; ++i) {
-      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
-      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
-   }
-
-   // compute interleaved mcu info
-   z->img_h_max = h_max;
-   z->img_v_max = v_max;
-   z->img_mcu_w = h_max * 8;
-   z->img_mcu_h = v_max * 8;
-   // these sizes can't be more than 17 bits
-   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
-   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
-
-   for (i=0; i < s->img_n; ++i) {
-      // number of effective pixels (e.g. for non-interleaved MCU)
-      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
-      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
-      // to simplify generation, we'll allocate enough memory to decode
-      // the bogus oversized data from using interleaved MCUs and their
-      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
-      // discard the extra data until colorspace conversion
-      //
-      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
-      // so these muls can't overflow with 32-bit ints (which we require)
-      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
-      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].coeff = 0;
-      z->img_comp[i].raw_coeff = 0;
-      z->img_comp[i].linebuf = NULL;
-      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-      if (z->img_comp[i].raw_data == NULL)
-         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
-      // align blocks for idct using mmx/sse
-      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      if (z->progressive) {
-         // w2, h2 are multiples of 8 (see above)
-         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
-         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
-         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-         if (z->img_comp[i].raw_coeff == NULL)
-            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
-         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      }
-   }
-
-   return 1;
-}
-
-// use comparisons since in some cases we handle more than one case (e.g. SOF)
-#define stbi__DNL(x)         ((x) == 0xdc)
-#define stbi__SOI(x)         ((x) == 0xd8)
-#define stbi__EOI(x)         ((x) == 0xd9)
-#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
-#define stbi__SOS(x)         ((x) == 0xda)
-
-#define stbi__SOF_progressive(x)   ((x) == 0xc2)
-
-static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
-{
-   int m;
-   z->jfif = 0;
-   z->app14_color_transform = -1; // valid values are 0,1,2
-   z->marker = STBI__MARKER_none; // initialize cached marker to empty
-   m = stbi__get_marker(z);
-   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
-   if (scan == STBI__SCAN_type) return 1;
-   m = stbi__get_marker(z);
-   while (!stbi__SOF(m)) {
-      if (!stbi__process_marker(z,m)) return 0;
-      m = stbi__get_marker(z);
-      while (m == STBI__MARKER_none) {
-         // some files have extra padding after their blocks, so ok, we'll scan
-         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
-         m = stbi__get_marker(z);
-      }
-   }
-   z->progressive = stbi__SOF_progressive(m);
-   if (!stbi__process_frame_header(z, scan)) return 0;
-   return 1;
-}
-
-static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
-{
-   // some JPEGs have junk at end, skip over it but if we find what looks
-   // like a valid marker, resume there
-   while (!stbi__at_eof(j->s)) {
-      int x = stbi__get8(j->s);
-      while (x == 255) { // might be a marker
-         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
-         x = stbi__get8(j->s);
-         if (x != 0x00 && x != 0xff) {
-            // not a stuffed zero or lead-in to another marker, looks
-            // like an actual marker, return it
-            return x;
-         }
-         // stuffed zero has x=0 now which ends the loop, meaning we go
-         // back to regular scan loop.
-         // repeated 0xff keeps trying to read the next byte of the marker.
-      }
-   }
-   return STBI__MARKER_none;
-}
-
-// decode image to YCbCr format
-static int stbi__decode_jpeg_image(stbi__jpeg *j)
-{
-   int m;
-   for (m = 0; m < 4; m++) {
-      j->img_comp[m].raw_data = NULL;
-      j->img_comp[m].raw_coeff = NULL;
-   }
-   j->restart_interval = 0;
-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
-   m = stbi__get_marker(j);
-   while (!stbi__EOI(m)) {
-      if (stbi__SOS(m)) {
-         if (!stbi__process_scan_header(j)) return 0;
-         if (!stbi__parse_entropy_coded_data(j)) return 0;
-         if (j->marker == STBI__MARKER_none ) {
-         j->marker = stbi__skip_jpeg_junk_at_end(j);
-            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
-         }
-         m = stbi__get_marker(j);
-         if (STBI__RESTART(m))
-            m = stbi__get_marker(j);
-      } else if (stbi__DNL(m)) {
-         int Ld = stbi__get16be(j->s);
-         stbi__uint32 NL = stbi__get16be(j->s);
-         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
-         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
-         m = stbi__get_marker(j);
-      } else {
-         if (!stbi__process_marker(j, m)) return 1;
-         m = stbi__get_marker(j);
-      }
-   }
-   if (j->progressive)
-      stbi__jpeg_finish(j);
-   return 1;
-}
-
-// static jfif-centered resampling (across block boundaries)
-
-typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
-                                    int w, int hs);
-
-#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
-
-static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   STBI_NOTUSED(out);
-   STBI_NOTUSED(in_far);
-   STBI_NOTUSED(w);
-   STBI_NOTUSED(hs);
-   return in_near;
-}
-
-static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate two samples vertically for every one in input
-   int i;
-   STBI_NOTUSED(hs);
-   for (i=0; i < w; ++i)
-      out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2);
-   return out;
-}
-
-static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate two samples horizontally for every one in input
-   int i;
-   stbi_uc *input = in_near;
-
-   if (w == 1) {
-      // if only one sample, can't do any interpolation
-      out[0] = out[1] = input[0];
-      return out;
-   }
-
-   out[0] = input[0];
-   out[1] = stbi__div4(input[0]*3 + input[1] + 2);
-   for (i=1; i < w-1; ++i) {
-      int n = 3*input[i]+2;
-      out[i*2+0] = stbi__div4(n+input[i-1]);
-      out[i*2+1] = stbi__div4(n+input[i+1]);
-   }
-   out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2);
-   out[i*2+1] = input[w-1];
-
-   STBI_NOTUSED(in_far);
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-
-#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
-
-static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate 2x2 samples for every one in input
-   int i,t0,t1;
-   if (w == 1) {
-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
-      return out;
-   }
-
-   t1 = 3*in_near[0] + in_far[0];
-   out[0] = stbi__div4(t1+2);
-   for (i=1; i < w; ++i) {
-      t0 = t1;
-      t1 = 3*in_near[i]+in_far[i];
-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
-   }
-   out[w*2-1] = stbi__div4(t1+2);
-
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-
-#if defined(STBI_SSE2) || defined(STBI_NEON)
-static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate 2x2 samples for every one in input
-   int i=0,t0,t1;
-
-   if (w == 1) {
-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
-      return out;
-   }
-
-   t1 = 3*in_near[0] + in_far[0];
-   // process groups of 8 pixels for as long as we can.
-   // note we can't handle the last pixel in a row in this loop
-   // because we need to handle the filter boundary conditions.
-   for (; i < ((w-1) & ~7); i += 8) {
-#if defined(STBI_SSE2)
-      // load and perform the vertical filtering pass
-      // this uses 3*x + y = 4*x + (y - x)
-      __m128i zero  = _mm_setzero_si128();
-      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
-      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
-      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
-      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
-      __m128i diff  = _mm_sub_epi16(farw, nearw);
-      __m128i nears = _mm_slli_epi16(nearw, 2);
-      __m128i curr  = _mm_add_epi16(nears, diff); // current row
-
-      // horizontal filter works the same based on shifted vers of current
-      // row. "prev" is current row shifted right by 1 pixel; we need to
-      // insert the previous pixel value (from t1).
-      // "next" is current row shifted left by 1 pixel, with first pixel
-      // of next block of 8 pixels added in.
-      __m128i prv0 = _mm_slli_si128(curr, 2);
-      __m128i nxt0 = _mm_srli_si128(curr, 2);
-      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
-      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
-
-      // horizontal filter, polyphase implementation since it's convenient:
-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-      // note the shared term.
-      __m128i bias  = _mm_set1_epi16(8);
-      __m128i curs = _mm_slli_epi16(curr, 2);
-      __m128i prvd = _mm_sub_epi16(prev, curr);
-      __m128i nxtd = _mm_sub_epi16(next, curr);
-      __m128i curb = _mm_add_epi16(curs, bias);
-      __m128i even = _mm_add_epi16(prvd, curb);
-      __m128i odd  = _mm_add_epi16(nxtd, curb);
-
-      // interleave even and odd pixels, then undo scaling.
-      __m128i int0 = _mm_unpacklo_epi16(even, odd);
-      __m128i int1 = _mm_unpackhi_epi16(even, odd);
-      __m128i de0  = _mm_srli_epi16(int0, 4);
-      __m128i de1  = _mm_srli_epi16(int1, 4);
-
-      // pack and write output
-      __m128i outv = _mm_packus_epi16(de0, de1);
-      _mm_storeu_si128((__m128i *) (out + i*2), outv);
-#elif defined(STBI_NEON)
-      // load and perform the vertical filtering pass
-      // this uses 3*x + y = 4*x + (y - x)
-      uint8x8_t farb  = vld1_u8(in_far + i);
-      uint8x8_t nearb = vld1_u8(in_near + i);
-      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
-      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-      int16x8_t curr  = vaddq_s16(nears, diff); // current row
-
-      // horizontal filter works the same based on shifted vers of current
-      // row. "prev" is current row shifted right by 1 pixel; we need to
-      // insert the previous pixel value (from t1).
-      // "next" is current row shifted left by 1 pixel, with first pixel
-      // of next block of 8 pixels added in.
-      int16x8_t prv0 = vextq_s16(curr, curr, 7);
-      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
-      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
-      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
-
-      // horizontal filter, polyphase implementation since it's convenient:
-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-      // note the shared term.
-      int16x8_t curs = vshlq_n_s16(curr, 2);
-      int16x8_t prvd = vsubq_s16(prev, curr);
-      int16x8_t nxtd = vsubq_s16(next, curr);
-      int16x8_t even = vaddq_s16(curs, prvd);
-      int16x8_t odd  = vaddq_s16(curs, nxtd);
-
-      // undo scaling and round, then store with even/odd phases interleaved
-      uint8x8x2_t o;
-      o.val[0] = vqrshrun_n_s16(even, 4);
-      o.val[1] = vqrshrun_n_s16(odd,  4);
-      vst2_u8(out + i*2, o);
-#endif
-
-      // "previous" value for next iter
-      t1 = 3*in_near[i+7] + in_far[i+7];
-   }
-
-   t0 = t1;
-   t1 = 3*in_near[i] + in_far[i];
-   out[i*2] = stbi__div16(3*t1 + t0 + 8);
-
-   for (++i; i < w; ++i) {
-      t0 = t1;
-      t1 = 3*in_near[i]+in_far[i];
-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
-   }
-   out[w*2-1] = stbi__div4(t1+2);
-
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-#endif
-
-static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // resample with nearest-neighbor
-   int i,j;
-   STBI_NOTUSED(in_far);
-   for (i=0; i < w; ++i)
-      for (j=0; j < hs; ++j)
-         out[i*hs+j] = in_near[i];
-   return out;
-}
-
-// this is a reduced-precision calculation of YCbCr-to-RGB introduced
-// to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
-      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-
-#if defined(STBI_SSE2) || defined(STBI_NEON)
-static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
-{
-   int i = 0;
-
-#ifdef STBI_SSE2
-   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
-   // it's useful in practice (you wouldn't use it for textures, for example).
-   // so just accelerate step == 4 case.
-   if (step == 4) {
-      // this is a fairly straightforward implementation and not super-optimized.
-      __m128i signflip  = _mm_set1_epi8(-0x80);
-      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
-      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
-      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
-      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
-      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
-      __m128i xw = _mm_set1_epi16(255); // alpha channel
-
-      for (; i+7 < count; i += 8) {
-         // load
-         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
-         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
-         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
-         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
-         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
-
-         // unpack to short (and left-shift cr, cb by 8)
-         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
-         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
-         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
-
-         // color transform
-         __m128i yws = _mm_srli_epi16(yw, 4);
-         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
-         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
-         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
-         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
-         __m128i rws = _mm_add_epi16(cr0, yws);
-         __m128i gwt = _mm_add_epi16(cb0, yws);
-         __m128i bws = _mm_add_epi16(yws, cb1);
-         __m128i gws = _mm_add_epi16(gwt, cr1);
-
-         // descale
-         __m128i rw = _mm_srai_epi16(rws, 4);
-         __m128i bw = _mm_srai_epi16(bws, 4);
-         __m128i gw = _mm_srai_epi16(gws, 4);
-
-         // back to byte, set up for transpose
-         __m128i brb = _mm_packus_epi16(rw, bw);
-         __m128i gxb = _mm_packus_epi16(gw, xw);
-
-         // transpose to interleave channels
-         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
-         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
-         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
-         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
-
-         // store
-         _mm_storeu_si128((__m128i *) (out + 0), o0);
-         _mm_storeu_si128((__m128i *) (out + 16), o1);
-         out += 32;
-      }
-   }
-#endif
-
-#ifdef STBI_NEON
-   // in this version, step=3 support would be easy to add. but is there demand?
-   if (step == 4) {
-      // this is a fairly straightforward implementation and not super-optimized.
-      uint8x8_t signflip = vdup_n_u8(0x80);
-      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
-      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
-      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
-      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
-
-      for (; i+7 < count; i += 8) {
-         // load
-         uint8x8_t y_bytes  = vld1_u8(y + i);
-         uint8x8_t cr_bytes = vld1_u8(pcr + i);
-         uint8x8_t cb_bytes = vld1_u8(pcb + i);
-         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
-         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
-
-         // expand to s16
-         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
-         int16x8_t crw = vshll_n_s8(cr_biased, 7);
-         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
-
-         // color transform
-         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
-         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
-         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
-         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
-         int16x8_t rws = vaddq_s16(yws, cr0);
-         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
-         int16x8_t bws = vaddq_s16(yws, cb1);
-
-         // undo scaling, round, convert to byte
-         uint8x8x4_t o;
-         o.val[0] = vqrshrun_n_s16(rws, 4);
-         o.val[1] = vqrshrun_n_s16(gws, 4);
-         o.val[2] = vqrshrun_n_s16(bws, 4);
-         o.val[3] = vdup_n_u8(255);
-
-         // store, interleaving r/g/b/a
-         vst4_u8(out, o);
-         out += 8*4;
-      }
-   }
-#endif
-
-   for (; i < count; ++i) {
-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr* stbi__float2fixed(1.40200f);
-      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-#endif
-
-// set up the kernels
-static void stbi__setup_jpeg(stbi__jpeg *j)
-{
-   j->idct_block_kernel = stbi__idct_block;
-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
-
-#ifdef STBI_SSE2
-   if (stbi__sse2_available()) {
-      j->idct_block_kernel = stbi__idct_simd;
-      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-   }
-#endif
-
-#ifdef STBI_NEON
-   j->idct_block_kernel = stbi__idct_simd;
-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-#endif
-}
-
-// clean up the temporary component buffers
-static void stbi__cleanup_jpeg(stbi__jpeg *j)
-{
-   stbi__free_jpeg_components(j, j->s->img_n, 0);
-}
-
-typedef struct
-{
-   resample_row_func resample;
-   stbi_uc *line0,*line1;
-   int hs,vs;   // expansion factor in each axis
-   int w_lores; // horizontal pixels pre-expansion
-   int ystep;   // how far through vertical expansion we are
-   int ypos;    // which pre-expansion row we're on
-} stbi__resample;
-
-// fast 0..255 * 0..255 => 0..255 rounded multiplication
-static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
-{
-   unsigned int t = x*y + 128;
-   return (stbi_uc) ((t + (t >>8)) >> 8);
-}
-
-static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
-{
-   int n, decode_n, is_rgb;
-   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
-
-   // validate req_comp
-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-
-   // load a jpeg image from whichever source, but leave in YCbCr format
-   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
-
-   // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
-
-   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
-
-   if (z->s->img_n == 3 && n < 3 && !is_rgb)
-      decode_n = 1;
-   else
-      decode_n = z->s->img_n;
-
-   // nothing to do if no components requested; check this now to avoid
-   // accessing uninitialized coutput[0] later
-   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
-
-   // resample and color-convert
-   {
-      int k;
-      unsigned int i,j;
-      stbi_uc *output;
-      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
-
-      stbi__resample res_comp[4];
-
-      for (k=0; k < decode_n; ++k) {
-         stbi__resample *r = &res_comp[k];
-
-         // allocate line buffer big enough for upsampling off the edges
-         // with upsample factor of 4
-         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
-         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
-
-         r->hs      = z->img_h_max / z->img_comp[k].h;
-         r->vs      = z->img_v_max / z->img_comp[k].v;
-         r->ystep   = r->vs >> 1;
-         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
-         r->ypos    = 0;
-         r->line0   = r->line1 = z->img_comp[k].data;
-
-         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
-         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
-         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
-         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
-         else                               r->resample = stbi__resample_row_generic;
-      }
-
-      // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
-
-      // now go ahead and resample
-      for (j=0; j < z->s->img_y; ++j) {
-         stbi_uc *out = output + n * z->s->img_x * j;
-         for (k=0; k < decode_n; ++k) {
-            stbi__resample *r = &res_comp[k];
-            int y_bot = r->ystep >= (r->vs >> 1);
-            coutput[k] = r->resample(z->img_comp[k].linebuf,
-                                     y_bot ? r->line1 : r->line0,
-                                     y_bot ? r->line0 : r->line1,
-                                     r->w_lores, r->hs);
-            if (++r->ystep >= r->vs) {
-               r->ystep = 0;
-               r->line0 = r->line1;
-               if (++r->ypos < z->img_comp[k].y)
-                  r->line1 += z->img_comp[k].w2;
-            }
-         }
-         if (n >= 3) {
-            stbi_uc *y = coutput[0];
-            if (z->s->img_n == 3) {
-               if (is_rgb) {
-                  for (i=0; i < z->s->img_x; ++i) {
-                     out[0] = y[i];
-                     out[1] = coutput[1][i];
-                     out[2] = coutput[2][i];
-                     out[3] = 255;
-                     out += n;
-                  }
-               } else {
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               }
-            } else if (z->s->img_n == 4) {
-               if (z->app14_color_transform == 0) { // CMYK
-                  for (i=0; i < z->s->img_x; ++i) {
-                     stbi_uc m = coutput[3][i];
-                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
-                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
-                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
-                     out[3] = 255;
-                     out += n;
-                  }
-               } else if (z->app14_color_transform == 2) { // YCCK
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                  for (i=0; i < z->s->img_x; ++i) {
-                     stbi_uc m = coutput[3][i];
-                     out[0] = stbi__blinn_8x8(255 - out[0], m);
-                     out[1] = stbi__blinn_8x8(255 - out[1], m);
-                     out[2] = stbi__blinn_8x8(255 - out[2], m);
-                     out += n;
-                  }
-               } else { // YCbCr + alpha?  Ignore the fourth channel for now
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               }
-            } else
-               for (i=0; i < z->s->img_x; ++i) {
-                  out[0] = out[1] = out[2] = y[i];
-                  out[3] = 255; // not used if n==3
-                  out += n;
-               }
-         } else {
-            if (is_rgb) {
-               if (n == 1)
-                  for (i=0; i < z->s->img_x; ++i)
-                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-               else {
-                  for (i=0; i < z->s->img_x; ++i, out += 2) {
-                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-                     out[1] = 255;
-                  }
-               }
-            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
-               for (i=0; i < z->s->img_x; ++i) {
-                  stbi_uc m = coutput[3][i];
-                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
-                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
-                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
-                  out[0] = stbi__compute_y(r, g, b);
-                  out[1] = 255;
-                  out += n;
-               }
-            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
-               for (i=0; i < z->s->img_x; ++i) {
-                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
-                  out[1] = 255;
-                  out += n;
-               }
-            } else {
-               stbi_uc *y = coutput[0];
-               if (n == 1)
-                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-               else
-                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
-            }
-         }
-      }
-      stbi__cleanup_jpeg(z);
-      *out_x = z->s->img_x;
-      *out_y = z->s->img_y;
-      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
-      return output;
-   }
-}
-
-static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   unsigned char* result;
-   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
-   if (!j) return stbi__errpuc("outofmem", "Out of memory");
-   memset(j, 0, sizeof(stbi__jpeg));
-   STBI_NOTUSED(ri);
-   j->s = s;
-   stbi__setup_jpeg(j);
-   result = load_jpeg_image(j, x,y,comp,req_comp);
-   STBI_FREE(j);
-   return result;
-}
-
-static int stbi__jpeg_test(stbi__context *s)
-{
-   int r;
-   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
-   if (!j) return stbi__err("outofmem", "Out of memory");
-   memset(j, 0, sizeof(stbi__jpeg));
-   j->s = s;
-   stbi__setup_jpeg(j);
-   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
-   stbi__rewind(s);
-   STBI_FREE(j);
-   return r;
-}
-
-static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
-{
-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
-      stbi__rewind( j->s );
-      return 0;
-   }
-   if (x) *x = j->s->img_x;
-   if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
-   return 1;
-}
-
-static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int result;
-   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
-   if (!j) return stbi__err("outofmem", "Out of memory");
-   memset(j, 0, sizeof(stbi__jpeg));
-   j->s = s;
-   result = stbi__jpeg_info_raw(j, x, y, comp);
-   STBI_FREE(j);
-   return result;
-}
-#endif
-
-// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
-//    simple implementation
-//      - all input must be provided in an upfront buffer
-//      - all output is written to a single output buffer (can malloc/realloc)
-//    performance
-//      - fast huffman
-
-#ifndef STBI_NO_ZLIB
-
-// fast-way is faster to check than jpeg huffman, but slow way is slower
-#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
-#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
-#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
-
-// zlib-style huffman encoding
-// (jpegs packs from left, zlib from right, so can't share code)
-typedef struct
-{
-   stbi__uint16 fast[1 << STBI__ZFAST_BITS];
-   stbi__uint16 firstcode[16];
-   int maxcode[17];
-   stbi__uint16 firstsymbol[16];
-   stbi_uc  size[STBI__ZNSYMS];
-   stbi__uint16 value[STBI__ZNSYMS];
-} stbi__zhuffman;
-
-stbi_inline static int stbi__bitreverse16(int n)
-{
-  n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
-  n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
-  n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
-  n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
-  return n;
-}
-
-stbi_inline static int stbi__bit_reverse(int v, int bits)
-{
-   STBI_ASSERT(bits <= 16);
-   // to bit reverse n bits, reverse 16 and shift
-   // e.g. 11 bits, bit reverse and shift away 5
-   return stbi__bitreverse16(v) >> (16-bits);
-}
-
-static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
-{
-   int i,k=0;
-   int code, next_code[16], sizes[17];
-
-   // DEFLATE spec for generating codes
-   memset(sizes, 0, sizeof(sizes));
-   memset(z->fast, 0, sizeof(z->fast));
-   for (i=0; i < num; ++i)
-      ++sizes[sizelist[i]];
-   sizes[0] = 0;
-   for (i=1; i < 16; ++i)
-      if (sizes[i] > (1 << i))
-         return stbi__err("bad sizes", "Corrupt PNG");
-   code = 0;
-   for (i=1; i < 16; ++i) {
-      next_code[i] = code;
-      z->firstcode[i] = (stbi__uint16) code;
-      z->firstsymbol[i] = (stbi__uint16) k;
-      code = (code + sizes[i]);
-      if (sizes[i])
-         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
-      z->maxcode[i] = code << (16-i); // preshift for inner loop
-      code <<= 1;
-      k += sizes[i];
-   }
-   z->maxcode[16] = 0x10000; // sentinel
-   for (i=0; i < num; ++i) {
-      int s = sizelist[i];
-      if (s) {
-         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
-         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
-         z->size [c] = (stbi_uc     ) s;
-         z->value[c] = (stbi__uint16) i;
-         if (s <= STBI__ZFAST_BITS) {
-            int j = stbi__bit_reverse(next_code[s],s);
-            while (j < (1 << STBI__ZFAST_BITS)) {
-               z->fast[j] = fastv;
-               j += (1 << s);
-            }
-         }
-         ++next_code[s];
-      }
-   }
-   return 1;
-}
-
-// zlib-from-memory implementation for PNG reading
-//    because PNG allows splitting the zlib stream arbitrarily,
-//    and it's annoying structurally to have PNG call ZLIB call PNG,
-//    we require PNG read all the IDATs and combine them into a single
-//    memory buffer
-
-typedef struct
-{
-   stbi_uc *zbuffer, *zbuffer_end;
-   int num_bits;
-   stbi__uint32 code_buffer;
-
-   char *zout;
-   char *zout_start;
-   char *zout_end;
-   int   z_expandable;
-
-   stbi__zhuffman z_length, z_distance;
-} stbi__zbuf;
-
-stbi_inline static int stbi__zeof(stbi__zbuf *z)
-{
-   return (z->zbuffer >= z->zbuffer_end);
-}
-
-stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
-{
-   return stbi__zeof(z) ? 0 : *z->zbuffer++;
-}
-
-static void stbi__fill_bits(stbi__zbuf *z)
-{
-   do {
-      if (z->code_buffer >= (1U << z->num_bits)) {
-        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
-        return;
-      }
-      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
-      z->num_bits += 8;
-   } while (z->num_bits <= 24);
-}
-
-stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
-{
-   unsigned int k;
-   if (z->num_bits < n) stbi__fill_bits(z);
-   k = z->code_buffer & ((1 << n) - 1);
-   z->code_buffer >>= n;
-   z->num_bits -= n;
-   return k;
-}
-
-static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
-{
-   int b,s,k;
-   // not resolved by fast table, so compute it the slow way
-   // use jpeg approach, which requires MSbits at top
-   k = stbi__bit_reverse(a->code_buffer, 16);
-   for (s=STBI__ZFAST_BITS+1; ; ++s)
-      if (k < z->maxcode[s])
-         break;
-   if (s >= 16) return -1; // invalid code!
-   // code size is s, so:
-   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
-   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
-   a->code_buffer >>= s;
-   a->num_bits -= s;
-   return z->value[b];
-}
-
-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
-{
-   int b,s;
-   if (a->num_bits < 16) {
-      if (stbi__zeof(a)) {
-         return -1;   /* report error for unexpected end of data. */
-      }
-      stbi__fill_bits(a);
-   }
-   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-   if (b) {
-      s = b >> 9;
-      a->code_buffer >>= s;
-      a->num_bits -= s;
-      return b & 511;
-   }
-   return stbi__zhuffman_decode_slowpath(a, z);
-}
-
-static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
-{
-   char *q;
-   unsigned int cur, limit, old_limit;
-   z->zout = zout;
-   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
-   cur   = (unsigned int) (z->zout - z->zout_start);
-   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
-   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
-   while (cur + n > limit) {
-      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
-      limit *= 2;
-   }
-   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
-   STBI_NOTUSED(old_limit);
-   if (q == NULL) return stbi__err("outofmem", "Out of memory");
-   z->zout_start = q;
-   z->zout       = q + cur;
-   z->zout_end   = q + limit;
-   return 1;
-}
-
-static const int stbi__zlength_base[31] = {
-   3,4,5,6,7,8,9,10,11,13,
-   15,17,19,23,27,31,35,43,51,59,
-   67,83,99,115,131,163,195,227,258,0,0 };
-
-static const int stbi__zlength_extra[31]=
-{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
-
-static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
-257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
-
-static const int stbi__zdist_extra[32] =
-{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
-
-static int stbi__parse_huffman_block(stbi__zbuf *a)
-{
-   char *zout = a->zout;
-   for(;;) {
-      int z = stbi__zhuffman_decode(a, &a->z_length);
-      if (z < 256) {
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
-         if (zout >= a->zout_end) {
-            if (!stbi__zexpand(a, zout, 1)) return 0;
-            zout = a->zout;
-         }
-         *zout++ = (char) z;
-      } else {
-         stbi_uc *p;
-         int len,dist;
-         if (z == 256) {
-            a->zout = zout;
-            return 1;
-         }
-         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
-         z -= 257;
-         len = stbi__zlength_base[z];
-         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
-         z = stbi__zhuffman_decode(a, &a->z_distance);
-         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
-         dist = stbi__zdist_base[z];
-         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
-         if (zout + len > a->zout_end) {
-            if (!stbi__zexpand(a, zout, len)) return 0;
-            zout = a->zout;
-         }
-         p = (stbi_uc *) (zout - dist);
-         if (dist == 1) { // run of one byte; common in images.
-            stbi_uc v = *p;
-            if (len) { do *zout++ = v; while (--len); }
-         } else {
-            if (len) { do *zout++ = *p++; while (--len); }
-         }
-      }
-   }
-}
-
-static int stbi__compute_huffman_codes(stbi__zbuf *a)
-{
-   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
-   stbi__zhuffman z_codelength;
-   stbi_uc lencodes[286+32+137];//padding for maximum single op
-   stbi_uc codelength_sizes[19];
-   int i,n;
-
-   int hlit  = stbi__zreceive(a,5) + 257;
-   int hdist = stbi__zreceive(a,5) + 1;
-   int hclen = stbi__zreceive(a,4) + 4;
-   int ntot  = hlit + hdist;
-
-   memset(codelength_sizes, 0, sizeof(codelength_sizes));
-   for (i=0; i < hclen; ++i) {
-      int s = stbi__zreceive(a,3);
-      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
-   }
-   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
-
-   n = 0;
-   while (n < ntot) {
-      int c = stbi__zhuffman_decode(a, &z_codelength);
-      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
-      if (c < 16)
-         lencodes[n++] = (stbi_uc) c;
-      else {
-         stbi_uc fill = 0;
-         if (c == 16) {
-            c = stbi__zreceive(a,2)+3;
-            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
-            fill = lencodes[n-1];
-         } else if (c == 17) {
-            c = stbi__zreceive(a,3)+3;
-         } else if (c == 18) {
-            c = stbi__zreceive(a,7)+11;
-         } else {
-            return stbi__err("bad codelengths", "Corrupt PNG");
-         }
-         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
-         memset(lencodes+n, fill, c);
-         n += c;
-      }
-   }
-   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
-   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
-   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
-   return 1;
-}
-
-static int stbi__parse_uncompressed_block(stbi__zbuf *a)
-{
-   stbi_uc header[4];
-   int len,nlen,k;
-   if (a->num_bits & 7)
-      stbi__zreceive(a, a->num_bits & 7); // discard
-   // drain the bit-packed data into header
-   k = 0;
-   while (a->num_bits > 0) {
-      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
-      a->code_buffer >>= 8;
-      a->num_bits -= 8;
-   }
-   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
-   // now fill header the normal way
-   while (k < 4)
-      header[k++] = stbi__zget8(a);
-   len  = header[1] * 256 + header[0];
-   nlen = header[3] * 256 + header[2];
-   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
-   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
-   if (a->zout + len > a->zout_end)
-      if (!stbi__zexpand(a, a->zout, len)) return 0;
-   memcpy(a->zout, a->zbuffer, len);
-   a->zbuffer += len;
-   a->zout += len;
-   return 1;
-}
-
-static int stbi__parse_zlib_header(stbi__zbuf *a)
-{
-   int cmf   = stbi__zget8(a);
-   int cm    = cmf & 15;
-   /* int cinfo = cmf >> 4; */
-   int flg   = stbi__zget8(a);
-   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
-   if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
-   if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
-   if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
-   // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
-   return 1;
-}
-
-static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
-{
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
-};
-static const stbi_uc stbi__zdefault_distance[32] =
-{
-   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
-};
-/*
-Init algorithm:
-{
-   int i;   // use <= to match clearly with spec
-   for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
-   for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
-   for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
-   for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
-
-   for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
-}
-*/
-
-static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
-{
-   int final, type;
-   if (parse_header)
-      if (!stbi__parse_zlib_header(a)) return 0;
-   a->num_bits = 0;
-   a->code_buffer = 0;
-   do {
-      final = stbi__zreceive(a,1);
-      type = stbi__zreceive(a,2);
-      if (type == 0) {
-         if (!stbi__parse_uncompressed_block(a)) return 0;
-      } else if (type == 3) {
-         return 0;
-      } else {
-         if (type == 1) {
-            // use fixed code lengths
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
-            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
-         } else {
-            if (!stbi__compute_huffman_codes(a)) return 0;
-         }
-         if (!stbi__parse_huffman_block(a)) return 0;
-      }
-   } while (!final);
-   return 1;
-}
-
-static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
-{
-   a->zout_start = obuf;
-   a->zout       = obuf;
-   a->zout_end   = obuf + olen;
-   a->z_expandable = exp;
-
-   return stbi__parse_zlib(a, parse_header);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(initial_size);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer + len;
-   if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
-{
-   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(initial_size);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer + len;
-   if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
-{
-   stbi__zbuf a;
-   a.zbuffer = (stbi_uc *) ibuffer;
-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
-   if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
-      return (int) (a.zout - a.zout_start);
-   else
-      return -1;
-}
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(16384);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer+len;
-   if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
-{
-   stbi__zbuf a;
-   a.zbuffer = (stbi_uc *) ibuffer;
-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
-   if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
-      return (int) (a.zout - a.zout_start);
-   else
-      return -1;
-}
-#endif
-
-// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
-//    simple implementation
-//      - only 8-bit samples
-//      - no CRC checking
-//      - allocates lots of intermediate memory
-//        - avoids problem of streaming data between subsystems
-//        - avoids explicit window management
-//    performance
-//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
-
-#ifndef STBI_NO_PNG
-typedef struct
-{
-   stbi__uint32 length;
-   stbi__uint32 type;
-} stbi__pngchunk;
-
-static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
-{
-   stbi__pngchunk c;
-   c.length = stbi__get32be(s);
-   c.type   = stbi__get32be(s);
-   return c;
-}
-
-static int stbi__check_png_header(stbi__context *s)
-{
-   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
-   int i;
-   for (i=0; i < 8; ++i)
-      if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
-   return 1;
-}
-
-typedef struct
-{
-   stbi__context *s;
-   stbi_uc *idata, *expanded, *out;
-   int depth;
-} stbi__png;
-
-
-enum {
-   STBI__F_none=0,
-   STBI__F_sub=1,
-   STBI__F_up=2,
-   STBI__F_avg=3,
-   STBI__F_paeth=4,
-   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
-   STBI__F_avg_first,
-   STBI__F_paeth_first
-};
-
-static stbi_uc first_row_filter[5] =
-{
-   STBI__F_none,
-   STBI__F_sub,
-   STBI__F_none,
-   STBI__F_avg_first,
-   STBI__F_paeth_first
-};
-
-static int stbi__paeth(int a, int b, int c)
-{
-   int p = a + b - c;
-   int pa = abs(p-a);
-   int pb = abs(p-b);
-   int pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return a;
-   if (pb <= pc) return b;
-   return c;
-}
-
-static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
-
-// create the png data from post-deflated data
-static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
-{
-   int bytes = (depth == 16? 2 : 1);
-   stbi__context *s = a->s;
-   stbi__uint32 i,j,stride = x*out_n*bytes;
-   stbi__uint32 img_len, img_width_bytes;
-   int k;
-   int img_n = s->img_n; // copy it into a local for later
-
-   int output_bytes = out_n*bytes;
-   int filter_bytes = img_n*bytes;
-   int width = x;
-
-   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
-   if (!a->out) return stbi__err("outofmem", "Out of memory");
-
-   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
-   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
-   img_len = (img_width_bytes + 1) * y;
-
-   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
-   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
-   // so just check for raw_len < img_len always.
-   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-
-   for (j=0; j < y; ++j) {
-      stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior;
-      int filter = *raw++;
-
-      if (filter > 4)
-         return stbi__err("invalid filter","Corrupt PNG");
-
-      if (depth < 8) {
-         if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG");
-         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
-         filter_bytes = 1;
-         width = img_width_bytes;
-      }
-      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
-
-      // if first row, use special filter that doesn't sample previous row
-      if (j == 0) filter = first_row_filter[filter];
-
-      // handle first byte explicitly
-      for (k=0; k < filter_bytes; ++k) {
-         switch (filter) {
-            case STBI__F_none       : cur[k] = raw[k]; break;
-            case STBI__F_sub        : cur[k] = raw[k]; break;
-            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
-            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
-            case STBI__F_avg_first  : cur[k] = raw[k]; break;
-            case STBI__F_paeth_first: cur[k] = raw[k]; break;
-         }
-      }
-
-      if (depth == 8) {
-         if (img_n != out_n)
-            cur[img_n] = 255; // first pixel
-         raw += img_n;
-         cur += out_n;
-         prior += out_n;
-      } else if (depth == 16) {
-         if (img_n != out_n) {
-            cur[filter_bytes]   = 255; // first pixel top byte
-            cur[filter_bytes+1] = 255; // first pixel bottom byte
-         }
-         raw += filter_bytes;
-         cur += output_bytes;
-         prior += output_bytes;
-      } else {
-         raw += 1;
-         cur += 1;
-         prior += 1;
-      }
-
-      // this is a little gross, so that we don't switch per-pixel or per-component
-      if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*filter_bytes;
-         #define STBI__CASE(f) \
-             case f:     \
-                for (k=0; k < nk; ++k)
-         switch (filter) {
-            // "none" filter turns into a memcpy here; make that explicit.
-            case STBI__F_none:         memcpy(cur, raw, nk); break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
-         }
-         #undef STBI__CASE
-         raw += nk;
-      } else {
-         STBI_ASSERT(img_n+1 == out_n);
-         #define STBI__CASE(f) \
-             case f:     \
-                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
-                   for (k=0; k < filter_bytes; ++k)
-         switch (filter) {
-            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
-         }
-         #undef STBI__CASE
-
-         // the loop above sets the high byte of the pixels' alpha, but for
-         // 16 bit png files we also need the low byte set. we'll do that here.
-         if (depth == 16) {
-            cur = a->out + stride*j; // start at the beginning of the row again
-            for (i=0; i < x; ++i,cur+=output_bytes) {
-               cur[filter_bytes+1] = 255;
-            }
-         }
-      }
-   }
-
-   // we make a separate pass to expand bits to pixels; for performance,
-   // this could run two scanlines behind the above code, so it won't
-   // intefere with filtering but will still be in the cache.
-   if (depth < 8) {
-      for (j=0; j < y; ++j) {
-         stbi_uc *cur = a->out + stride*j;
-         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
-         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
-         // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
-         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
-
-         // note that the final byte might overshoot and write more data than desired.
-         // we can allocate enough data that this never writes out of memory, but it
-         // could also overwrite the next scanline. can it overwrite non-empty data
-         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
-         // so we need to explicitly clamp the final ones
-
-         if (depth == 4) {
-            for (k=x*img_n; k >= 2; k-=2, ++in) {
-               *cur++ = scale * ((*in >> 4)       );
-               *cur++ = scale * ((*in     ) & 0x0f);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 4)       );
-         } else if (depth == 2) {
-            for (k=x*img_n; k >= 4; k-=4, ++in) {
-               *cur++ = scale * ((*in >> 6)       );
-               *cur++ = scale * ((*in >> 4) & 0x03);
-               *cur++ = scale * ((*in >> 2) & 0x03);
-               *cur++ = scale * ((*in     ) & 0x03);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 6)       );
-            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
-            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
-         } else if (depth == 1) {
-            for (k=x*img_n; k >= 8; k-=8, ++in) {
-               *cur++ = scale * ((*in >> 7)       );
-               *cur++ = scale * ((*in >> 6) & 0x01);
-               *cur++ = scale * ((*in >> 5) & 0x01);
-               *cur++ = scale * ((*in >> 4) & 0x01);
-               *cur++ = scale * ((*in >> 3) & 0x01);
-               *cur++ = scale * ((*in >> 2) & 0x01);
-               *cur++ = scale * ((*in >> 1) & 0x01);
-               *cur++ = scale * ((*in     ) & 0x01);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 7)       );
-            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
-            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
-            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
-            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
-            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
-            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
-         }
-         if (img_n != out_n) {
-            int q;
-            // insert alpha = 255
-            cur = a->out + stride*j;
-            if (img_n == 1) {
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*2+1] = 255;
-                  cur[q*2+0] = cur[q];
-               }
-            } else {
-               STBI_ASSERT(img_n == 3);
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*4+3] = 255;
-                  cur[q*4+2] = cur[q*3+2];
-                  cur[q*4+1] = cur[q*3+1];
-                  cur[q*4+0] = cur[q*3+0];
-               }
-            }
-         }
-      }
-   } else if (depth == 16) {
-      // force the image data from big-endian to platform-native.
-      // this is done in a separate pass due to the decoding relying
-      // on the data being untouched, but could probably be done
-      // per-line during decode if care is taken.
-      stbi_uc *cur = a->out;
-      stbi__uint16 *cur16 = (stbi__uint16*)cur;
-
-      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
-         *cur16 = (cur[0] << 8) | cur[1];
-      }
-   }
-
-   return 1;
-}
-
-static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
-{
-   int bytes = (depth == 16 ? 2 : 1);
-   int out_bytes = out_n * bytes;
-   stbi_uc *final;
-   int p;
-   if (!interlaced)
-      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
-
-   // de-interlacing
-   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-   if (!final) return stbi__err("outofmem", "Out of memory");
-   for (p=0; p < 7; ++p) {
-      int xorig[] = { 0,4,0,2,0,1,0 };
-      int yorig[] = { 0,0,4,0,2,0,1 };
-      int xspc[]  = { 8,8,4,4,2,2,1 };
-      int yspc[]  = { 8,8,8,4,4,2,2 };
-      int i,j,x,y;
-      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
-      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
-      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
-      if (x && y) {
-         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
-            STBI_FREE(final);
-            return 0;
-         }
-         for (j=0; j < y; ++j) {
-            for (i=0; i < x; ++i) {
-               int out_y = j*yspc[p]+yorig[p];
-               int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
-                      a->out + (j*x+i)*out_bytes, out_bytes);
-            }
-         }
-         STBI_FREE(a->out);
-         image_data += img_len;
-         image_data_len -= img_len;
-      }
-   }
-   a->out = final;
-
-   return 1;
-}
-
-static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi_uc *p = z->out;
-
-   // compute color-based transparency, assuming we've
-   // already got 255 as the alpha value in the output
-   STBI_ASSERT(out_n == 2 || out_n == 4);
-
-   if (out_n == 2) {
-      for (i=0; i < pixel_count; ++i) {
-         p[1] = (p[0] == tc[0] ? 0 : 255);
-         p += 2;
-      }
-   } else {
-      for (i=0; i < pixel_count; ++i) {
-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-            p[3] = 0;
-         p += 4;
-      }
-   }
-   return 1;
-}
-
-static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi__uint16 *p = (stbi__uint16*) z->out;
-
-   // compute color-based transparency, assuming we've
-   // already got 65535 as the alpha value in the output
-   STBI_ASSERT(out_n == 2 || out_n == 4);
-
-   if (out_n == 2) {
-      for (i = 0; i < pixel_count; ++i) {
-         p[1] = (p[0] == tc[0] ? 0 : 65535);
-         p += 2;
-      }
-   } else {
-      for (i = 0; i < pixel_count; ++i) {
-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-            p[3] = 0;
-         p += 4;
-      }
-   }
-   return 1;
-}
-
-static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
-{
-   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
-   stbi_uc *p, *temp_out, *orig = a->out;
-
-   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-   if (p == NULL) return stbi__err("outofmem", "Out of memory");
-
-   // between here and free(out) below, exitting would leak
-   temp_out = p;
-
-   if (pal_img_n == 3) {
-      for (i=0; i < pixel_count; ++i) {
-         int n = orig[i]*4;
-         p[0] = palette[n  ];
-         p[1] = palette[n+1];
-         p[2] = palette[n+2];
-         p += 3;
-      }
-   } else {
-      for (i=0; i < pixel_count; ++i) {
-         int n = orig[i]*4;
-         p[0] = palette[n  ];
-         p[1] = palette[n+1];
-         p[2] = palette[n+2];
-         p[3] = palette[n+3];
-         p += 4;
-      }
-   }
-   STBI_FREE(a->out);
-   a->out = temp_out;
-
-   STBI_NOTUSED(len);
-
-   return 1;
-}
-
-static int stbi__unpremultiply_on_load_global = 0;
-static int stbi__de_iphone_flag_global = 0;
-
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
-{
-   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
-}
-
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
-{
-   stbi__de_iphone_flag_global = flag_true_if_should_convert;
-}
-
-#ifndef STBI_THREAD_LOCAL
-#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
-#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
-#else
-static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
-static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
-
-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
-{
-   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
-   stbi__unpremultiply_on_load_set = 1;
-}
-
-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
-{
-   stbi__de_iphone_flag_local = flag_true_if_should_convert;
-   stbi__de_iphone_flag_set = 1;
-}
-
-#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
-                                       ? stbi__unpremultiply_on_load_local      \
-                                       : stbi__unpremultiply_on_load_global)
-#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
-                                ? stbi__de_iphone_flag_local                    \
-                                : stbi__de_iphone_flag_global)
-#endif // STBI_THREAD_LOCAL
-
-static void stbi__de_iphone(stbi__png *z)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi_uc *p = z->out;
-
-   if (s->img_out_n == 3) {  // convert bgr to rgb
-      for (i=0; i < pixel_count; ++i) {
-         stbi_uc t = p[0];
-         p[0] = p[2];
-         p[2] = t;
-         p += 3;
-      }
-   } else {
-      STBI_ASSERT(s->img_out_n == 4);
-      if (stbi__unpremultiply_on_load) {
-         // convert bgr to rgb and unpremultiply
-         for (i=0; i < pixel_count; ++i) {
-            stbi_uc a = p[3];
-            stbi_uc t = p[0];
-            if (a) {
-               stbi_uc half = a / 2;
-               p[0] = (p[2] * 255 + half) / a;
-               p[1] = (p[1] * 255 + half) / a;
-               p[2] = ( t   * 255 + half) / a;
-            } else {
-               p[0] = p[2];
-               p[2] = t;
-            }
-            p += 4;
-         }
-      } else {
-         // convert bgr to rgb
-         for (i=0; i < pixel_count; ++i) {
-            stbi_uc t = p[0];
-            p[0] = p[2];
-            p[2] = t;
-            p += 4;
-         }
-      }
-   }
-}
-
-#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
-
-static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
-{
-   stbi_uc palette[1024], pal_img_n=0;
-   stbi_uc has_trans=0, tc[3]={0};
-   stbi__uint16 tc16[3];
-   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, color=0, is_iphone=0;
-   stbi__context *s = z->s;
-
-   z->expanded = NULL;
-   z->idata = NULL;
-   z->out = NULL;
-
-   if (!stbi__check_png_header(s)) return 0;
-
-   if (scan == STBI__SCAN_type) return 1;
-
-   for (;;) {
-      stbi__pngchunk c = stbi__get_chunk_header(s);
-      switch (c.type) {
-         case STBI__PNG_TYPE('C','g','B','I'):
-            is_iphone = 1;
-            stbi__skip(s, c.length);
-            break;
-         case STBI__PNG_TYPE('I','H','D','R'): {
-            int comp,filter;
-            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
-            first = 0;
-            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
-            s->img_x = stbi__get32be(s);
-            s->img_y = stbi__get32be(s);
-            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
-            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
-            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
-            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
-            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
-            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
-            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
-            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
-            if (!pal_img_n) {
-               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-            } else {
-               // if paletted, then pal_n is our final components, and
-               // img_n is # components to decompress/filter.
-               s->img_n = 1;
-               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
-            }
-            // even with SCAN_header, have to scan to see if we have a tRNS
-            break;
-         }
-
-         case STBI__PNG_TYPE('P','L','T','E'):  {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
-            pal_len = c.length / 3;
-            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
-            for (i=0; i < pal_len; ++i) {
-               palette[i*4+0] = stbi__get8(s);
-               palette[i*4+1] = stbi__get8(s);
-               palette[i*4+2] = stbi__get8(s);
-               palette[i*4+3] = 255;
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('t','R','N','S'): {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
-            if (pal_img_n) {
-               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
-               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
-               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
-               pal_img_n = 4;
-               for (i=0; i < c.length; ++i)
-                  palette[i*4+3] = stbi__get8(s);
-            } else {
-               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
-               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
-               has_trans = 1;
-               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
-               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
-               if (z->depth == 16) {
-                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
-               } else {
-                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
-               }
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('I','D','A','T'): {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == STBI__SCAN_header) {
-               // header scan definitely stops at first IDAT
-               if (pal_img_n)
-                  s->img_n = pal_img_n;
-               return 1;
-            }
-            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
-            if ((int)(ioff + c.length) < (int)ioff) return 0;
-            if (ioff + c.length > idata_limit) {
-               stbi__uint32 idata_limit_old = idata_limit;
-               stbi_uc *p;
-               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
-               while (ioff + c.length > idata_limit)
-                  idata_limit *= 2;
-               STBI_NOTUSED(idata_limit_old);
-               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
-               z->idata = p;
-            }
-            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
-            ioff += c.length;
-            break;
-         }
-
-         case STBI__PNG_TYPE('I','E','N','D'): {
-            stbi__uint32 raw_len, bpl;
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (scan != STBI__SCAN_load) return 1;
-            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
-            // initial guess for decoded data size to avoid unnecessary reallocs
-            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
-            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
-            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
-            if (z->expanded == NULL) return 0; // zlib should set error
-            STBI_FREE(z->idata); z->idata = NULL;
-            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
-               s->img_out_n = s->img_n+1;
-            else
-               s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
-            if (has_trans) {
-               if (z->depth == 16) {
-                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
-               } else {
-                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
-               }
-            }
-            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-               stbi__de_iphone(z);
-            if (pal_img_n) {
-               // pal_img_n == 3 or 4
-               s->img_n = pal_img_n; // record the actual colors we had
-               s->img_out_n = pal_img_n;
-               if (req_comp >= 3) s->img_out_n = req_comp;
-               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
-                  return 0;
-            } else if (has_trans) {
-               // non-paletted image with tRNS -> source image has (constant) alpha
-               ++s->img_n;
-            }
-            STBI_FREE(z->expanded); z->expanded = NULL;
-            // end of PNG chunk, read and skip CRC
-            stbi__get32be(s);
-            return 1;
-         }
-
-         default:
-            // if critical, fail
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if ((c.type & (1 << 29)) == 0) {
-               #ifndef STBI_NO_FAILURE_STRINGS
-               // not threadsafe
-               static char invalid_chunk[] = "XXXX PNG chunk not known";
-               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
-               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
-               #endif
-               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
-            }
-            stbi__skip(s, c.length);
-            break;
-      }
-      // end of PNG chunk, read and skip CRC
-      stbi__get32be(s);
-   }
-}
-
-static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
-{
-   void *result=NULL;
-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-      if (p->depth <= 8)
-         ri->bits_per_channel = 8;
-      else if (p->depth == 16)
-         ri->bits_per_channel = 16;
-      else
-         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
-      result = p->out;
-      p->out = NULL;
-      if (req_comp && req_comp != p->s->img_out_n) {
-         if (ri->bits_per_channel == 8)
-            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-         else
-            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-         p->s->img_out_n = req_comp;
-         if (result == NULL) return result;
-      }
-      *x = p->s->img_x;
-      *y = p->s->img_y;
-      if (n) *n = p->s->img_n;
-   }
-   STBI_FREE(p->out);      p->out      = NULL;
-   STBI_FREE(p->expanded); p->expanded = NULL;
-   STBI_FREE(p->idata);    p->idata    = NULL;
-
-   return result;
-}
-
-static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi__png p;
-   p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp, ri);
-}
-
-static int stbi__png_test(stbi__context *s)
-{
-   int r;
-   r = stbi__check_png_header(s);
-   stbi__rewind(s);
-   return r;
-}
-
-static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
-{
-   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
-      stbi__rewind( p->s );
-      return 0;
-   }
-   if (x) *x = p->s->img_x;
-   if (y) *y = p->s->img_y;
-   if (comp) *comp = p->s->img_n;
-   return 1;
-}
-
-static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   stbi__png p;
-   p.s = s;
-   return stbi__png_info_raw(&p, x, y, comp);
-}
-
-static int stbi__png_is16(stbi__context *s)
-{
-   stbi__png p;
-   p.s = s;
-   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
-	   return 0;
-   if (p.depth != 16) {
-      stbi__rewind(p.s);
-      return 0;
-   }
-   return 1;
-}
-#endif
-
-// Microsoft/Windows BMP image
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_test_raw(stbi__context *s)
-{
-   int r;
-   int sz;
-   if (stbi__get8(s) != 'B') return 0;
-   if (stbi__get8(s) != 'M') return 0;
-   stbi__get32le(s); // discard filesize
-   stbi__get16le(s); // discard reserved
-   stbi__get16le(s); // discard reserved
-   stbi__get32le(s); // discard data offset
-   sz = stbi__get32le(s);
-   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
-   return r;
-}
-
-static int stbi__bmp_test(stbi__context *s)
-{
-   int r = stbi__bmp_test_raw(s);
-   stbi__rewind(s);
-   return r;
-}
-
-
-// returns 0..31 for the highest set bit
-static int stbi__high_bit(unsigned int z)
-{
-   int n=0;
-   if (z == 0) return -1;
-   if (z >= 0x10000) { n += 16; z >>= 16; }
-   if (z >= 0x00100) { n +=  8; z >>=  8; }
-   if (z >= 0x00010) { n +=  4; z >>=  4; }
-   if (z >= 0x00004) { n +=  2; z >>=  2; }
-   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
-   return n;
-}
-
-static int stbi__bitcount(unsigned int a)
-{
-   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
-   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
-   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
-   a = (a + (a >> 8)); // max 16 per 8 bits
-   a = (a + (a >> 16)); // max 32 per 8 bits
-   return a & 0xff;
-}
-
-// extract an arbitrarily-aligned N-bit value (N=bits)
-// from v, and then make it 8-bits long and fractionally
-// extend it to full full range.
-static int stbi__shiftsigned(unsigned int v, int shift, int bits)
-{
-   static unsigned int mul_table[9] = {
-      0,
-      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
-      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
-   };
-   static unsigned int shift_table[9] = {
-      0, 0,0,1,0,2,4,6,0,
-   };
-   if (shift < 0)
-      v <<= -shift;
-   else
-      v >>= shift;
-   STBI_ASSERT(v < 256);
-   v >>= (8-bits);
-   STBI_ASSERT(bits >= 0 && bits <= 8);
-   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
-}
-
-typedef struct
-{
-   int bpp, offset, hsz;
-   unsigned int mr,mg,mb,ma, all_a;
-   int extra_read;
-} stbi__bmp_data;
-
-static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
-{
-   // BI_BITFIELDS specifies masks explicitly, don't override
-   if (compress == 3)
-      return 1;
-
-   if (compress == 0) {
-      if (info->bpp == 16) {
-         info->mr = 31u << 10;
-         info->mg = 31u <<  5;
-         info->mb = 31u <<  0;
-      } else if (info->bpp == 32) {
-         info->mr = 0xffu << 16;
-         info->mg = 0xffu <<  8;
-         info->mb = 0xffu <<  0;
-         info->ma = 0xffu << 24;
-         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-      } else {
-         // otherwise, use defaults, which is all-0
-         info->mr = info->mg = info->mb = info->ma = 0;
-      }
-      return 1;
-   }
-   return 0; // error
-}
-
-static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
-{
-   int hsz;
-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
-   stbi__get32le(s); // discard filesize
-   stbi__get16le(s); // discard reserved
-   stbi__get16le(s); // discard reserved
-   info->offset = stbi__get32le(s);
-   info->hsz = hsz = stbi__get32le(s);
-   info->mr = info->mg = info->mb = info->ma = 0;
-   info->extra_read = 14;
-
-   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
-
-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-   if (hsz == 12) {
-      s->img_x = stbi__get16le(s);
-      s->img_y = stbi__get16le(s);
-   } else {
-      s->img_x = stbi__get32le(s);
-      s->img_y = stbi__get32le(s);
-   }
-   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
-   info->bpp = stbi__get16le(s);
-   if (hsz != 12) {
-      int compress = stbi__get32le(s);
-      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
-      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
-      stbi__get32le(s); // discard sizeof
-      stbi__get32le(s); // discard hres
-      stbi__get32le(s); // discard vres
-      stbi__get32le(s); // discard colorsused
-      stbi__get32le(s); // discard max important
-      if (hsz == 40 || hsz == 56) {
-         if (hsz == 56) {
-            stbi__get32le(s);
-            stbi__get32le(s);
-            stbi__get32le(s);
-            stbi__get32le(s);
-         }
-         if (info->bpp == 16 || info->bpp == 32) {
-            if (compress == 0) {
-               stbi__bmp_set_mask_defaults(info, compress);
-            } else if (compress == 3) {
-               info->mr = stbi__get32le(s);
-               info->mg = stbi__get32le(s);
-               info->mb = stbi__get32le(s);
-               info->extra_read += 12;
-               // not documented, but generated by photoshop and handled by mspaint
-               if (info->mr == info->mg && info->mg == info->mb) {
-                  // ?!?!?
-                  return stbi__errpuc("bad BMP", "bad BMP");
-               }
-            } else
-               return stbi__errpuc("bad BMP", "bad BMP");
-         }
-      } else {
-         // V4/V5 header
-         int i;
-         if (hsz != 108 && hsz != 124)
-            return stbi__errpuc("bad BMP", "bad BMP");
-         info->mr = stbi__get32le(s);
-         info->mg = stbi__get32le(s);
-         info->mb = stbi__get32le(s);
-         info->ma = stbi__get32le(s);
-         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
-            stbi__bmp_set_mask_defaults(info, compress);
-         stbi__get32le(s); // discard color space
-         for (i=0; i < 12; ++i)
-            stbi__get32le(s); // discard color space parameters
-         if (hsz == 124) {
-            stbi__get32le(s); // discard rendering intent
-            stbi__get32le(s); // discard offset of profile data
-            stbi__get32le(s); // discard size of profile data
-            stbi__get32le(s); // discard reserved
-         }
-      }
-   }
-   return (void *) 1;
-}
-
-
-static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *out;
-   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
-   stbi_uc pal[256][4];
-   int psize=0,i,j,width;
-   int flip_vertically, pad, target;
-   stbi__bmp_data info;
-   STBI_NOTUSED(ri);
-
-   info.all_a = 255;
-   if (stbi__bmp_parse_header(s, &info) == NULL)
-      return NULL; // error code already set
-
-   flip_vertically = ((int) s->img_y) > 0;
-   s->img_y = abs((int) s->img_y);
-
-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   mr = info.mr;
-   mg = info.mg;
-   mb = info.mb;
-   ma = info.ma;
-   all_a = info.all_a;
-
-   if (info.hsz == 12) {
-      if (info.bpp < 24)
-         psize = (info.offset - info.extra_read - 24) / 3;
-   } else {
-      if (info.bpp < 16)
-         psize = (info.offset - info.extra_read - info.hsz) >> 2;
-   }
-   if (psize == 0) {
-      // accept some number of extra bytes after the header, but if the offset points either to before
-      // the header ends or implies a large amount of extra data, reject the file as malformed
-      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
-      int header_limit = 1024; // max we actually read is below 256 bytes currently.
-      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
-      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
-         return stbi__errpuc("bad header", "Corrupt BMP");
-      }
-      // we established that bytes_read_so_far is positive and sensible.
-      // the first half of this test rejects offsets that are either too small positives, or
-      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
-      // ensures the number computed in the second half of the test can't overflow.
-      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
-         return stbi__errpuc("bad offset", "Corrupt BMP");
-      } else {
-         stbi__skip(s, info.offset - bytes_read_so_far);
-      }
-   }
-
-   if (info.bpp == 24 && ma == 0xff000000)
-      s->img_n = 3;
-   else
-      s->img_n = ma ? 4 : 3;
-   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
-      target = req_comp;
-   else
-      target = s->img_n; // if they want monochrome, we'll post-convert
-
-   // sanity-check size
-   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
-      return stbi__errpuc("too large", "Corrupt BMP");
-
-   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (info.bpp < 16) {
-      int z=0;
-      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
-      for (i=0; i < psize; ++i) {
-         pal[i][2] = stbi__get8(s);
-         pal[i][1] = stbi__get8(s);
-         pal[i][0] = stbi__get8(s);
-         if (info.hsz != 12) stbi__get8(s);
-         pal[i][3] = 255;
-      }
-      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
-      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
-      else if (info.bpp == 8) width = s->img_x;
-      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
-      pad = (-width)&3;
-      if (info.bpp == 1) {
-         for (j=0; j < (int) s->img_y; ++j) {
-            int bit_offset = 7, v = stbi__get8(s);
-            for (i=0; i < (int) s->img_x; ++i) {
-               int color = (v>>bit_offset)&0x1;
-               out[z++] = pal[color][0];
-               out[z++] = pal[color][1];
-               out[z++] = pal[color][2];
-               if (target == 4) out[z++] = 255;
-               if (i+1 == (int) s->img_x) break;
-               if((--bit_offset) < 0) {
-                  bit_offset = 7;
-                  v = stbi__get8(s);
-               }
-            }
-            stbi__skip(s, pad);
-         }
-      } else {
-         for (j=0; j < (int) s->img_y; ++j) {
-            for (i=0; i < (int) s->img_x; i += 2) {
-               int v=stbi__get8(s),v2=0;
-               if (info.bpp == 4) {
-                  v2 = v & 15;
-                  v >>= 4;
-               }
-               out[z++] = pal[v][0];
-               out[z++] = pal[v][1];
-               out[z++] = pal[v][2];
-               if (target == 4) out[z++] = 255;
-               if (i+1 == (int) s->img_x) break;
-               v = (info.bpp == 8) ? stbi__get8(s) : v2;
-               out[z++] = pal[v][0];
-               out[z++] = pal[v][1];
-               out[z++] = pal[v][2];
-               if (target == 4) out[z++] = 255;
-            }
-            stbi__skip(s, pad);
-         }
-      }
-   } else {
-      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
-      int z = 0;
-      int easy=0;
-      stbi__skip(s, info.offset - info.extra_read - info.hsz);
-      if (info.bpp == 24) width = 3 * s->img_x;
-      else if (info.bpp == 16) width = 2*s->img_x;
-      else /* bpp = 32 and pad = 0 */ width=0;
-      pad = (-width) & 3;
-      if (info.bpp == 24) {
-         easy = 1;
-      } else if (info.bpp == 32) {
-         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
-            easy = 2;
-      }
-      if (!easy) {
-         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
-         // right shift amt to put high bit in position #7
-         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
-         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
-         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
-         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
-         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
-      }
-      for (j=0; j < (int) s->img_y; ++j) {
-         if (easy) {
-            for (i=0; i < (int) s->img_x; ++i) {
-               unsigned char a;
-               out[z+2] = stbi__get8(s);
-               out[z+1] = stbi__get8(s);
-               out[z+0] = stbi__get8(s);
-               z += 3;
-               a = (easy == 2 ? stbi__get8(s) : 255);
-               all_a |= a;
-               if (target == 4) out[z++] = a;
-            }
-         } else {
-            int bpp = info.bpp;
-            for (i=0; i < (int) s->img_x; ++i) {
-               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               unsigned int a;
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
-               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
-               all_a |= a;
-               if (target == 4) out[z++] = STBI__BYTECAST(a);
-            }
-         }
-         stbi__skip(s, pad);
-      }
-   }
-
-   // if alpha channel is all 0s, replace with all 255s
-   if (target == 4 && all_a == 0)
-      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
-         out[i] = 255;
-
-   if (flip_vertically) {
-      stbi_uc t;
-      for (j=0; j < (int) s->img_y>>1; ++j) {
-         stbi_uc *p1 = out +      j     *s->img_x*target;
-         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
-         for (i=0; i < (int) s->img_x*target; ++i) {
-            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
-         }
-      }
-   }
-
-   if (req_comp && req_comp != target) {
-      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-
-   *x = s->img_x;
-   *y = s->img_y;
-   if (comp) *comp = s->img_n;
-   return out;
-}
-#endif
-
-// Targa Truevision - TGA
-// by Jonathan Dummer
-#ifndef STBI_NO_TGA
-// returns STBI_rgb or whatever, 0 on error
-static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
-{
-   // only RGB or RGBA (incl. 16bit) or grey allowed
-   if (is_rgb16) *is_rgb16 = 0;
-   switch(bits_per_pixel) {
-      case 8:  return STBI_grey;
-      case 16: if(is_grey) return STBI_grey_alpha;
-               // fallthrough
-      case 15: if(is_rgb16) *is_rgb16 = 1;
-               return STBI_rgb;
-      case 24: // fallthrough
-      case 32: return bits_per_pixel/8;
-      default: return 0;
-   }
-}
-
-static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
-{
-    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
-    int sz, tga_colormap_type;
-    stbi__get8(s);                   // discard Offset
-    tga_colormap_type = stbi__get8(s); // colormap type
-    if( tga_colormap_type > 1 ) {
-        stbi__rewind(s);
-        return 0;      // only RGB or indexed allowed
-    }
-    tga_image_type = stbi__get8(s); // image type
-    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
-        if (tga_image_type != 1 && tga_image_type != 9) {
-            stbi__rewind(s);
-            return 0;
-        }
-        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
-            stbi__rewind(s);
-            return 0;
-        }
-        stbi__skip(s,4);       // skip image x and y origin
-        tga_colormap_bpp = sz;
-    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
-            stbi__rewind(s);
-            return 0; // only RGB or grey allowed, +/- RLE
-        }
-        stbi__skip(s,9); // skip colormap specification and image x/y origin
-        tga_colormap_bpp = 0;
-    }
-    tga_w = stbi__get16le(s);
-    if( tga_w < 1 ) {
-        stbi__rewind(s);
-        return 0;   // test width
-    }
-    tga_h = stbi__get16le(s);
-    if( tga_h < 1 ) {
-        stbi__rewind(s);
-        return 0;   // test height
-    }
-    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
-    stbi__get8(s); // ignore alpha bits
-    if (tga_colormap_bpp != 0) {
-        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
-            // when using a colormap, tga_bits_per_pixel is the size of the indexes
-            // I don't think anything but 8 or 16bit indexes makes sense
-            stbi__rewind(s);
-            return 0;
-        }
-        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
-    } else {
-        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
-    }
-    if(!tga_comp) {
-      stbi__rewind(s);
-      return 0;
-    }
-    if (x) *x = tga_w;
-    if (y) *y = tga_h;
-    if (comp) *comp = tga_comp;
-    return 1;                   // seems to have passed everything
-}
-
-static int stbi__tga_test(stbi__context *s)
-{
-   int res = 0;
-   int sz, tga_color_type;
-   stbi__get8(s);      //   discard Offset
-   tga_color_type = stbi__get8(s);   //   color type
-   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
-   sz = stbi__get8(s);   //   image type
-   if ( tga_color_type == 1 ) { // colormapped (paletted) image
-      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
-      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
-      sz = stbi__get8(s);    //   check bits per palette color entry
-      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
-      stbi__skip(s,4);       // skip image x and y origin
-   } else { // "normal" image w/o colormap
-      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
-      stbi__skip(s,9); // skip colormap specification and image x/y origin
-   }
-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
-   sz = stbi__get8(s);   //   bits per pixel
-   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
-   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
-
-   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
-
-errorEnd:
-   stbi__rewind(s);
-   return res;
-}
-
-// read 16bit value and convert to 24bit RGB
-static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
-{
-   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
-   stbi__uint16 fiveBitMask = 31;
-   // we have 3 channels with 5bits each
-   int r = (px >> 10) & fiveBitMask;
-   int g = (px >> 5) & fiveBitMask;
-   int b = px & fiveBitMask;
-   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
-   out[0] = (stbi_uc)((r * 255)/31);
-   out[1] = (stbi_uc)((g * 255)/31);
-   out[2] = (stbi_uc)((b * 255)/31);
-
-   // some people claim that the most significant bit might be used for alpha
-   // (possibly if an alpha-bit is set in the "image descriptor byte")
-   // but that only made 16bit test images completely translucent..
-   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
-}
-
-static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   //   read in the TGA header stuff
-   int tga_offset = stbi__get8(s);
-   int tga_indexed = stbi__get8(s);
-   int tga_image_type = stbi__get8(s);
-   int tga_is_RLE = 0;
-   int tga_palette_start = stbi__get16le(s);
-   int tga_palette_len = stbi__get16le(s);
-   int tga_palette_bits = stbi__get8(s);
-   int tga_x_origin = stbi__get16le(s);
-   int tga_y_origin = stbi__get16le(s);
-   int tga_width = stbi__get16le(s);
-   int tga_height = stbi__get16le(s);
-   int tga_bits_per_pixel = stbi__get8(s);
-   int tga_comp, tga_rgb16=0;
-   int tga_inverted = stbi__get8(s);
-   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
-   //   image data
-   unsigned char *tga_data;
-   unsigned char *tga_palette = NULL;
-   int i, j;
-   unsigned char raw_data[4] = {0};
-   int RLE_count = 0;
-   int RLE_repeating = 0;
-   int read_next_pixel = 1;
-   STBI_NOTUSED(ri);
-   STBI_NOTUSED(tga_x_origin); // @TODO
-   STBI_NOTUSED(tga_y_origin); // @TODO
-
-   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   //   do a tiny bit of precessing
-   if ( tga_image_type >= 8 )
-   {
-      tga_image_type -= 8;
-      tga_is_RLE = 1;
-   }
-   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
-
-   //   If I'm paletted, then I'll use the number of bits from the palette
-   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
-   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
-
-   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
-      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
-
-   //   tga info
-   *x = tga_width;
-   *y = tga_height;
-   if (comp) *comp = tga_comp;
-
-   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
-      return stbi__errpuc("too large", "Corrupt TGA");
-
-   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
-
-   // skip to the data's starting position (offset usually = 0)
-   stbi__skip(s, tga_offset );
-
-   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
-      for (i=0; i < tga_height; ++i) {
-         int row = tga_inverted ? tga_height -i - 1 : i;
-         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
-         stbi__getn(s, tga_row, tga_width * tga_comp);
-      }
-   } else  {
-      //   do I need to load a palette?
-      if ( tga_indexed)
-      {
-         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
-            STBI_FREE(tga_data);
-            return stbi__errpuc("bad palette", "Corrupt TGA");
-         }
-
-         //   any data to skip? (offset usually = 0)
-         stbi__skip(s, tga_palette_start );
-         //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-         if (!tga_palette) {
-            STBI_FREE(tga_data);
-            return stbi__errpuc("outofmem", "Out of memory");
-         }
-         if (tga_rgb16) {
-            stbi_uc *pal_entry = tga_palette;
-            STBI_ASSERT(tga_comp == STBI_rgb);
-            for (i=0; i < tga_palette_len; ++i) {
-               stbi__tga_read_rgb16(s, pal_entry);
-               pal_entry += tga_comp;
-            }
-         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
-               STBI_FREE(tga_data);
-               STBI_FREE(tga_palette);
-               return stbi__errpuc("bad palette", "Corrupt TGA");
-         }
-      }
-      //   load the data
-      for (i=0; i < tga_width * tga_height; ++i)
-      {
-         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
-         if ( tga_is_RLE )
-         {
-            if ( RLE_count == 0 )
-            {
-               //   yep, get the next byte as a RLE command
-               int RLE_cmd = stbi__get8(s);
-               RLE_count = 1 + (RLE_cmd & 127);
-               RLE_repeating = RLE_cmd >> 7;
-               read_next_pixel = 1;
-            } else if ( !RLE_repeating )
-            {
-               read_next_pixel = 1;
-            }
-         } else
-         {
-            read_next_pixel = 1;
-         }
-         //   OK, if I need to read a pixel, do it now
-         if ( read_next_pixel )
-         {
-            //   load however much data we did have
-            if ( tga_indexed )
-            {
-               // read in index, then perform the lookup
-               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-               if ( pal_idx >= tga_palette_len ) {
-                  // invalid index
-                  pal_idx = 0;
-               }
-               pal_idx *= tga_comp;
-               for (j = 0; j < tga_comp; ++j) {
-                  raw_data[j] = tga_palette[pal_idx+j];
-               }
-            } else if(tga_rgb16) {
-               STBI_ASSERT(tga_comp == STBI_rgb);
-               stbi__tga_read_rgb16(s, raw_data);
-            } else {
-               //   read in the data raw
-               for (j = 0; j < tga_comp; ++j) {
-                  raw_data[j] = stbi__get8(s);
-               }
-            }
-            //   clear the reading flag for the next pixel
-            read_next_pixel = 0;
-         } // end of reading a pixel
-
-         // copy data
-         for (j = 0; j < tga_comp; ++j)
-           tga_data[i*tga_comp+j] = raw_data[j];
-
-         //   in case we're in RLE mode, keep counting down
-         --RLE_count;
-      }
-      //   do I need to invert the image?
-      if ( tga_inverted )
-      {
-         for (j = 0; j*2 < tga_height; ++j)
-         {
-            int index1 = j * tga_width * tga_comp;
-            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-            for (i = tga_width * tga_comp; i > 0; --i)
-            {
-               unsigned char temp = tga_data[index1];
-               tga_data[index1] = tga_data[index2];
-               tga_data[index2] = temp;
-               ++index1;
-               ++index2;
-            }
-         }
-      }
-      //   clear my palette, if I had one
-      if ( tga_palette != NULL )
-      {
-         STBI_FREE( tga_palette );
-      }
-   }
-
-   // swap RGB - if the source data was RGB16, it already is in the right order
-   if (tga_comp >= 3 && !tga_rgb16)
-   {
-      unsigned char* tga_pixel = tga_data;
-      for (i=0; i < tga_width * tga_height; ++i)
-      {
-         unsigned char temp = tga_pixel[0];
-         tga_pixel[0] = tga_pixel[2];
-         tga_pixel[2] = temp;
-         tga_pixel += tga_comp;
-      }
-   }
-
-   // convert to target component count
-   if (req_comp && req_comp != tga_comp)
-      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
-
-   //   the things I do to get rid of an error message, and yet keep
-   //   Microsoft's C compilers happy... [8^(
-   tga_palette_start = tga_palette_len = tga_palette_bits =
-         tga_x_origin = tga_y_origin = 0;
-   STBI_NOTUSED(tga_palette_start);
-   //   OK, done
-   return tga_data;
-}
-#endif
-
-// *************************************************************************************************
-// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context *s)
-{
-   int r = (stbi__get32be(s) == 0x38425053);
-   stbi__rewind(s);
-   return r;
-}
-
-static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
-{
-   int count, nleft, len;
-
-   count = 0;
-   while ((nleft = pixelCount - count) > 0) {
-      len = stbi__get8(s);
-      if (len == 128) {
-         // No-op.
-      } else if (len < 128) {
-         // Copy next len+1 bytes literally.
-         len++;
-         if (len > nleft) return 0; // corrupt data
-         count += len;
-         while (len) {
-            *p = stbi__get8(s);
-            p += 4;
-            len--;
-         }
-      } else if (len > 128) {
-         stbi_uc   val;
-         // Next -len+1 bytes in the dest are replicated from next source byte.
-         // (Interpret len as a negative 8-bit int.)
-         len = 257 - len;
-         if (len > nleft) return 0; // corrupt data
-         val = stbi__get8(s);
-         count += len;
-         while (len) {
-            *p = val;
-            p += 4;
-            len--;
-         }
-      }
-   }
-
-   return 1;
-}
-
-static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
-{
-   int pixelCount;
-   int channelCount, compression;
-   int channel, i;
-   int bitdepth;
-   int w,h;
-   stbi_uc *out;
-   STBI_NOTUSED(ri);
-
-   // Check identifier
-   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
-      return stbi__errpuc("not PSD", "Corrupt PSD image");
-
-   // Check file type version.
-   if (stbi__get16be(s) != 1)
-      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
-
-   // Skip 6 reserved bytes.
-   stbi__skip(s, 6 );
-
-   // Read the number of channels (R, G, B, A, etc).
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16)
-      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
-
-   // Read the rows and columns of the image.
-   h = stbi__get32be(s);
-   w = stbi__get32be(s);
-
-   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   // Make sure the depth is 8 bits.
-   bitdepth = stbi__get16be(s);
-   if (bitdepth != 8 && bitdepth != 16)
-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
-
-   // Make sure the color mode is RGB.
-   // Valid options are:
-   //   0: Bitmap
-   //   1: Grayscale
-   //   2: Indexed color
-   //   3: RGB color
-   //   4: CMYK color
-   //   7: Multichannel
-   //   8: Duotone
-   //   9: Lab color
-   if (stbi__get16be(s) != 3)
-      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
-
-   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
-   stbi__skip(s,stbi__get32be(s) );
-
-   // Skip the image resources.  (resolution, pen tool paths, etc)
-   stbi__skip(s, stbi__get32be(s) );
-
-   // Skip the reserved data.
-   stbi__skip(s, stbi__get32be(s) );
-
-   // Find out if the data is compressed.
-   // Known values:
-   //   0: no compression
-   //   1: RLE compressed
-   compression = stbi__get16be(s);
-   if (compression > 1)
-      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
-
-   // Check size
-   if (!stbi__mad3sizes_valid(4, w, h, 0))
-      return stbi__errpuc("too large", "Corrupt PSD");
-
-   // Create the destination image.
-
-   if (!compression && bitdepth == 16 && bpc == 16) {
-      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
-      ri->bits_per_channel = 16;
-   } else
-      out = (stbi_uc *) stbi__malloc(4 * w*h);
-
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   pixelCount = w*h;
-
-   // Initialize the data to zero.
-   //memset( out, 0, pixelCount * 4 );
-
-   // Finally, the image data.
-   if (compression) {
-      // RLE as used by .PSD and .TIFF
-      // Loop until you get the number of unpacked bytes you are expecting:
-      //     Read the next source byte into n.
-      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
-      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
-      //     Else if n is 128, noop.
-      // Endloop
-
-      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
-      // which we're going to just skip.
-      stbi__skip(s, h * channelCount * 2 );
-
-      // Read the RLE data by channel.
-      for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out+channel;
-         if (channel >= channelCount) {
-            // Fill this channel with default data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = (channel == 3 ? 255 : 0);
-         } else {
-            // Read the RLE data.
-            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
-               STBI_FREE(out);
-               return stbi__errpuc("corrupt", "bad RLE data");
-            }
-         }
-      }
-
-   } else {
-      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
-
-      // Read the data by channel.
-      for (channel = 0; channel < 4; channel++) {
-         if (channel >= channelCount) {
-            // Fill this channel with default data.
-            if (bitdepth == 16 && bpc == 16) {
-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
-               stbi__uint16 val = channel == 3 ? 65535 : 0;
-               for (i = 0; i < pixelCount; i++, q += 4)
-                  *q = val;
-            } else {
-               stbi_uc *p = out+channel;
-               stbi_uc val = channel == 3 ? 255 : 0;
-               for (i = 0; i < pixelCount; i++, p += 4)
-                  *p = val;
-            }
-         } else {
-            if (ri->bits_per_channel == 16) {    // output bpc
-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
-               for (i = 0; i < pixelCount; i++, q += 4)
-                  *q = (stbi__uint16) stbi__get16be(s);
-            } else {
-               stbi_uc *p = out+channel;
-               if (bitdepth == 16) {  // input bpc
-                  for (i = 0; i < pixelCount; i++, p += 4)
-                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
-               } else {
-                  for (i = 0; i < pixelCount; i++, p += 4)
-                     *p = stbi__get8(s);
-               }
-            }
-         }
-      }
-   }
-
-   // remove weird white matte from PSD
-   if (channelCount >= 4) {
-      if (ri->bits_per_channel == 16) {
-         for (i=0; i < w*h; ++i) {
-            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
-            if (pixel[3] != 0 && pixel[3] != 65535) {
-               float a = pixel[3] / 65535.0f;
-               float ra = 1.0f / a;
-               float inv_a = 65535.0f * (1 - ra);
-               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
-               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
-               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
-            }
-         }
-      } else {
-         for (i=0; i < w*h; ++i) {
-            unsigned char *pixel = out + 4*i;
-            if (pixel[3] != 0 && pixel[3] != 255) {
-               float a = pixel[3] / 255.0f;
-               float ra = 1.0f / a;
-               float inv_a = 255.0f * (1 - ra);
-               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
-               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
-               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
-            }
-         }
-      }
-   }
-
-   // convert to desired output format
-   if (req_comp && req_comp != 4) {
-      if (ri->bits_per_channel == 16)
-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
-      else
-         out = stbi__convert_format(out, 4, req_comp, w, h);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-
-   if (comp) *comp = 4;
-   *y = h;
-   *x = w;
-
-   return out;
-}
-#endif
-
-// *************************************************************************************************
-// Softimage PIC loader
-// by Tom Seddon
-//
-// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
-// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_is4(stbi__context *s,const char *str)
-{
-   int i;
-   for (i=0; i<4; ++i)
-      if (stbi__get8(s) != (stbi_uc)str[i])
-         return 0;
-
-   return 1;
-}
-
-static int stbi__pic_test_core(stbi__context *s)
-{
-   int i;
-
-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
-      return 0;
-
-   for(i=0;i<84;++i)
-      stbi__get8(s);
-
-   if (!stbi__pic_is4(s,"PICT"))
-      return 0;
-
-   return 1;
-}
-
-typedef struct
-{
-   stbi_uc size,type,channel;
-} stbi__pic_packet;
-
-static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
-{
-   int mask=0x80, i;
-
-   for (i=0; i<4; ++i, mask>>=1) {
-      if (channel & mask) {
-         if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
-         dest[i]=stbi__get8(s);
-      }
-   }
-
-   return dest;
-}
-
-static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
-{
-   int mask=0x80,i;
-
-   for (i=0;i<4; ++i, mask>>=1)
-      if (channel&mask)
-         dest[i]=src[i];
-}
-
-static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
-{
-   int act_comp=0,num_packets=0,y,chained;
-   stbi__pic_packet packets[10];
-
-   // this will (should...) cater for even some bizarre stuff like having data
-    // for the same channel in multiple packets.
-   do {
-      stbi__pic_packet *packet;
-
-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
-         return stbi__errpuc("bad format","too many packets");
-
-      packet = &packets[num_packets++];
-
-      chained = stbi__get8(s);
-      packet->size    = stbi__get8(s);
-      packet->type    = stbi__get8(s);
-      packet->channel = stbi__get8(s);
-
-      act_comp |= packet->channel;
-
-      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
-      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
-   } while (chained);
-
-   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
-
-   for(y=0; y<height; ++y) {
-      int packet_idx;
-
-      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
-         stbi__pic_packet *packet = &packets[packet_idx];
-         stbi_uc *dest = result+y*width*4;
-
-         switch (packet->type) {
-            default:
-               return stbi__errpuc("bad format","packet has bad compression type");
-
-            case 0: {//uncompressed
-               int x;
-
-               for(x=0;x<width;++x, dest+=4)
-                  if (!stbi__readval(s,packet->channel,dest))
-                     return 0;
-               break;
-            }
-
-            case 1://Pure RLE
-               {
-                  int left=width, i;
-
-                  while (left>0) {
-                     stbi_uc count,value[4];
-
-                     count=stbi__get8(s);
-                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
-
-                     if (count > left)
-                        count = (stbi_uc) left;
-
-                     if (!stbi__readval(s,packet->channel,value))  return 0;
-
-                     for(i=0; i<count; ++i,dest+=4)
-                        stbi__copyval(packet->channel,dest,value);
-                     left -= count;
-                  }
-               }
-               break;
-
-            case 2: {//Mixed RLE
-               int left=width;
-               while (left>0) {
-                  int count = stbi__get8(s), i;
-                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
-
-                  if (count >= 128) { // Repeated
-                     stbi_uc value[4];
-
-                     if (count==128)
-                        count = stbi__get16be(s);
-                     else
-                        count -= 127;
-                     if (count > left)
-                        return stbi__errpuc("bad file","scanline overrun");
-
-                     if (!stbi__readval(s,packet->channel,value))
-                        return 0;
-
-                     for(i=0;i<count;++i, dest += 4)
-                        stbi__copyval(packet->channel,dest,value);
-                  } else { // Raw
-                     ++count;
-                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
-
-                     for(i=0;i<count;++i, dest+=4)
-                        if (!stbi__readval(s,packet->channel,dest))
-                           return 0;
-                  }
-                  left-=count;
-               }
-               break;
-            }
-         }
-      }
-   }
-
-   return result;
-}
-
-static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *result;
-   int i, x,y, internal_comp;
-   STBI_NOTUSED(ri);
-
-   if (!comp) comp = &internal_comp;
-
-   for (i=0; i<92; ++i)
-      stbi__get8(s);
-
-   x = stbi__get16be(s);
-   y = stbi__get16be(s);
-
-   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
-
-   stbi__get32be(s); //skip `ratio'
-   stbi__get16be(s); //skip `fields'
-   stbi__get16be(s); //skip `pad'
-
-   // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
-   if (!result) return stbi__errpuc("outofmem", "Out of memory");
-   memset(result, 0xff, x*y*4);
-
-   if (!stbi__pic_load_core(s,x,y,comp, result)) {
-      STBI_FREE(result);
-      result=0;
-   }
-   *px = x;
-   *py = y;
-   if (req_comp == 0) req_comp = *comp;
-   result=stbi__convert_format(result,4,req_comp,x,y);
-
-   return result;
-}
-
-static int stbi__pic_test(stbi__context *s)
-{
-   int r = stbi__pic_test_core(s);
-   stbi__rewind(s);
-   return r;
-}
-#endif
-
-// *************************************************************************************************
-// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
-
-#ifndef STBI_NO_GIF
-typedef struct
-{
-   stbi__int16 prefix;
-   stbi_uc first;
-   stbi_uc suffix;
-} stbi__gif_lzw;
-
-typedef struct
-{
-   int w,h;
-   stbi_uc *out;                 // output buffer (always 4 components)
-   stbi_uc *background;          // The current "background" as far as a gif is concerned
-   stbi_uc *history;
-   int flags, bgindex, ratio, transparent, eflags;
-   stbi_uc  pal[256][4];
-   stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[8192];
-   stbi_uc *color_table;
-   int parse, step;
-   int lflags;
-   int start_x, start_y;
-   int max_x, max_y;
-   int cur_x, cur_y;
-   int line_size;
-   int delay;
-} stbi__gif;
-
-static int stbi__gif_test_raw(stbi__context *s)
-{
-   int sz;
-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0;
-   sz = stbi__get8(s);
-   if (sz != '9' && sz != '7') return 0;
-   if (stbi__get8(s) != 'a') return 0;
-   return 1;
-}
-
-static int stbi__gif_test(stbi__context *s)
-{
-   int r = stbi__gif_test_raw(s);
-   stbi__rewind(s);
-   return r;
-}
-
-static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
-{
-   int i;
-   for (i=0; i < num_entries; ++i) {
-      pal[i][2] = stbi__get8(s);
-      pal[i][1] = stbi__get8(s);
-      pal[i][0] = stbi__get8(s);
-      pal[i][3] = transp == i ? 0 : 255;
-   }
-}
-
-static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
-{
-   stbi_uc version;
-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
-      return stbi__err("not GIF", "Corrupt GIF");
-
-   version = stbi__get8(s);
-   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
-   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
-
-   stbi__g_failure_reason = "";
-   g->w = stbi__get16le(s);
-   g->h = stbi__get16le(s);
-   g->flags = stbi__get8(s);
-   g->bgindex = stbi__get8(s);
-   g->ratio = stbi__get8(s);
-   g->transparent = -1;
-
-   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-
-   if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
-
-   if (is_info) return 1;
-
-   if (g->flags & 0x80)
-      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
-
-   return 1;
-}
-
-static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
-{
-   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
-   if (!g) return stbi__err("outofmem", "Out of memory");
-   if (!stbi__gif_header(s, g, comp, 1)) {
-      STBI_FREE(g);
-      stbi__rewind( s );
-      return 0;
-   }
-   if (x) *x = g->w;
-   if (y) *y = g->h;
-   STBI_FREE(g);
-   return 1;
-}
-
-static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
-{
-   stbi_uc *p, *c;
-   int idx;
-
-   // recurse to decode the prefixes, since the linked-list is backwards,
-   // and working backwards through an interleaved image would be nasty
-   if (g->codes[code].prefix >= 0)
-      stbi__out_gif_code(g, g->codes[code].prefix);
-
-   if (g->cur_y >= g->max_y) return;
-
-   idx = g->cur_x + g->cur_y;
-   p = &g->out[idx];
-   g->history[idx / 4] = 1;
-
-   c = &g->color_table[g->codes[code].suffix * 4];
-   if (c[3] > 128) { // don't render transparent pixels;
-      p[0] = c[2];
-      p[1] = c[1];
-      p[2] = c[0];
-      p[3] = c[3];
-   }
-   g->cur_x += 4;
-
-   if (g->cur_x >= g->max_x) {
-      g->cur_x = g->start_x;
-      g->cur_y += g->step;
-
-      while (g->cur_y >= g->max_y && g->parse > 0) {
-         g->step = (1 << g->parse) * g->line_size;
-         g->cur_y = g->start_y + (g->step >> 1);
-         --g->parse;
-      }
-   }
-}
-
-static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
-{
-   stbi_uc lzw_cs;
-   stbi__int32 len, init_code;
-   stbi__uint32 first;
-   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
-   stbi__gif_lzw *p;
-
-   lzw_cs = stbi__get8(s);
-   if (lzw_cs > 12) return NULL;
-   clear = 1 << lzw_cs;
-   first = 1;
-   codesize = lzw_cs + 1;
-   codemask = (1 << codesize) - 1;
-   bits = 0;
-   valid_bits = 0;
-   for (init_code = 0; init_code < clear; init_code++) {
-      g->codes[init_code].prefix = -1;
-      g->codes[init_code].first = (stbi_uc) init_code;
-      g->codes[init_code].suffix = (stbi_uc) init_code;
-   }
-
-   // support no starting clear code
-   avail = clear+2;
-   oldcode = -1;
-
-   len = 0;
-   for(;;) {
-      if (valid_bits < codesize) {
-         if (len == 0) {
-            len = stbi__get8(s); // start new block
-            if (len == 0)
-               return g->out;
-         }
-         --len;
-         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
-         valid_bits += 8;
-      } else {
-         stbi__int32 code = bits & codemask;
-         bits >>= codesize;
-         valid_bits -= codesize;
-         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-         if (code == clear) {  // clear code
-            codesize = lzw_cs + 1;
-            codemask = (1 << codesize) - 1;
-            avail = clear + 2;
-            oldcode = -1;
-            first = 0;
-         } else if (code == clear + 1) { // end of stream code
-            stbi__skip(s, len);
-            while ((len = stbi__get8(s)) > 0)
-               stbi__skip(s,len);
-            return g->out;
-         } else if (code <= avail) {
-            if (first) {
-               return stbi__errpuc("no clear code", "Corrupt GIF");
-            }
-
-            if (oldcode >= 0) {
-               p = &g->codes[avail++];
-               if (avail > 8192) {
-                  return stbi__errpuc("too many codes", "Corrupt GIF");
-               }
-
-               p->prefix = (stbi__int16) oldcode;
-               p->first = g->codes[oldcode].first;
-               p->suffix = (code == avail) ? p->first : g->codes[code].first;
-            } else if (code == avail)
-               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-
-            stbi__out_gif_code(g, (stbi__uint16) code);
-
-            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
-               codesize++;
-               codemask = (1 << codesize) - 1;
-            }
-
-            oldcode = code;
-         } else {
-            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-         }
-      }
-   }
-}
-
-// this function is designed to support animated gifs, although stb_image doesn't support it
-// two back is the image from two frames ago, used for a very specific disposal format
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
-{
-   int dispose;
-   int first_frame;
-   int pi;
-   int pcount;
-   STBI_NOTUSED(req_comp);
-
-   // on first frame, any non-written pixels get the background colour (non-transparent)
-   first_frame = 0;
-   if (g->out == 0) {
-      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
-      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
-         return stbi__errpuc("too large", "GIF image is too large");
-      pcount = g->w * g->h;
-      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
-      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
-      g->history = (stbi_uc *) stbi__malloc(pcount);
-      if (!g->out || !g->background || !g->history)
-         return stbi__errpuc("outofmem", "Out of memory");
-
-      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
-      // background colour is only used for pixels that are not rendered first frame, after that "background"
-      // color refers to the color that was there the previous frame.
-      memset(g->out, 0x00, 4 * pcount);
-      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
-      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
-      first_frame = 1;
-   } else {
-      // second frame - how do we dispose of the previous one?
-      dispose = (g->eflags & 0x1C) >> 2;
-      pcount = g->w * g->h;
-
-      if ((dispose == 3) && (two_back == 0)) {
-         dispose = 2; // if I don't have an image to revert back to, default to the old background
-      }
-
-      if (dispose == 3) { // use previous graphic
-         for (pi = 0; pi < pcount; ++pi) {
-            if (g->history[pi]) {
-               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
-            }
-         }
-      } else if (dispose == 2) {
-         // restore what was changed last frame to background before that frame;
-         for (pi = 0; pi < pcount; ++pi) {
-            if (g->history[pi]) {
-               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
-            }
-         }
-      } else {
-         // This is a non-disposal case eithe way, so just
-         // leave the pixels as is, and they will become the new background
-         // 1: do not dispose
-         // 0:  not specified.
-      }
-
-      // background is what out is after the undoing of the previou frame;
-      memcpy( g->background, g->out, 4 * g->w * g->h );
-   }
-
-   // clear my history;
-   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
-
-   for (;;) {
-      int tag = stbi__get8(s);
-      switch (tag) {
-         case 0x2C: /* Image Descriptor */
-         {
-            stbi__int32 x, y, w, h;
-            stbi_uc *o;
-
-            x = stbi__get16le(s);
-            y = stbi__get16le(s);
-            w = stbi__get16le(s);
-            h = stbi__get16le(s);
-            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
-               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
-
-            g->line_size = g->w * 4;
-            g->start_x = x * 4;
-            g->start_y = y * g->line_size;
-            g->max_x   = g->start_x + w * 4;
-            g->max_y   = g->start_y + h * g->line_size;
-            g->cur_x   = g->start_x;
-            g->cur_y   = g->start_y;
-
-            // if the width of the specified rectangle is 0, that means
-            // we may not see *any* pixels or the image is malformed;
-            // to make sure this is caught, move the current y down to
-            // max_y (which is what out_gif_code checks).
-            if (w == 0)
-               g->cur_y = g->max_y;
-
-            g->lflags = stbi__get8(s);
-
-            if (g->lflags & 0x40) {
-               g->step = 8 * g->line_size; // first interlaced spacing
-               g->parse = 3;
-            } else {
-               g->step = g->line_size;
-               g->parse = 0;
-            }
-
-            if (g->lflags & 0x80) {
-               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
-               g->color_table = (stbi_uc *) g->lpal;
-            } else if (g->flags & 0x80) {
-               g->color_table = (stbi_uc *) g->pal;
-            } else
-               return stbi__errpuc("missing color table", "Corrupt GIF");
-
-            o = stbi__process_gif_raster(s, g);
-            if (!o) return NULL;
-
-            // if this was the first frame,
-            pcount = g->w * g->h;
-            if (first_frame && (g->bgindex > 0)) {
-               // if first frame, any pixel not drawn to gets the background color
-               for (pi = 0; pi < pcount; ++pi) {
-                  if (g->history[pi] == 0) {
-                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
-                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
-                  }
-               }
-            }
-
-            return o;
-         }
-
-         case 0x21: // Comment Extension.
-         {
-            int len;
-            int ext = stbi__get8(s);
-            if (ext == 0xF9) { // Graphic Control Extension.
-               len = stbi__get8(s);
-               if (len == 4) {
-                  g->eflags = stbi__get8(s);
-                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
-
-                  // unset old transparent
-                  if (g->transparent >= 0) {
-                     g->pal[g->transparent][3] = 255;
-                  }
-                  if (g->eflags & 0x01) {
-                     g->transparent = stbi__get8(s);
-                     if (g->transparent >= 0) {
-                        g->pal[g->transparent][3] = 0;
-                     }
-                  } else {
-                     // don't need transparent
-                     stbi__skip(s, 1);
-                     g->transparent = -1;
-                  }
-               } else {
-                  stbi__skip(s, len);
-                  break;
-               }
-            }
-            while ((len = stbi__get8(s)) != 0) {
-               stbi__skip(s, len);
-            }
-            break;
-         }
-
-         case 0x3B: // gif stream termination code
-            return (stbi_uc *) s; // using '1' causes warning on some compilers
-
-         default:
-            return stbi__errpuc("unknown code", "Corrupt GIF");
-      }
-   }
-}
-
-static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
-{
-   STBI_FREE(g->out);
-   STBI_FREE(g->history);
-   STBI_FREE(g->background);
-
-   if (out) STBI_FREE(out);
-   if (delays && *delays) STBI_FREE(*delays);
-   return stbi__errpuc("outofmem", "Out of memory");
-}
-
-static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
-{
-   if (stbi__gif_test(s)) {
-      int layers = 0;
-      stbi_uc *u = 0;
-      stbi_uc *out = 0;
-      stbi_uc *two_back = 0;
-      stbi__gif g;
-      int stride;
-      int out_size = 0;
-      int delays_size = 0;
-
-      STBI_NOTUSED(out_size);
-      STBI_NOTUSED(delays_size);
-
-      memset(&g, 0, sizeof(g));
-      if (delays) {
-         *delays = 0;
-      }
-
-      do {
-         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
-
-         if (u) {
-            *x = g.w;
-            *y = g.h;
-            ++layers;
-            stride = g.w * g.h * 4;
-
-            if (out) {
-               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
-               if (!tmp)
-                  return stbi__load_gif_main_outofmem(&g, out, delays);
-               else {
-                   out = (stbi_uc*) tmp;
-                   out_size = layers * stride;
-               }
-
-               if (delays) {
-                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
-                  if (!new_delays)
-                     return stbi__load_gif_main_outofmem(&g, out, delays);
-                  *delays = new_delays;
-                  delays_size = layers * sizeof(int);
-               }
-            } else {
-               out = (stbi_uc*)stbi__malloc( layers * stride );
-               if (!out)
-                  return stbi__load_gif_main_outofmem(&g, out, delays);
-               out_size = layers * stride;
-               if (delays) {
-                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
-                  if (!*delays)
-                     return stbi__load_gif_main_outofmem(&g, out, delays);
-                  delays_size = layers * sizeof(int);
-               }
-            }
-            memcpy( out + ((layers - 1) * stride), u, stride );
-            if (layers >= 2) {
-               two_back = out - 2 * stride;
-            }
-
-            if (delays) {
-               (*delays)[layers - 1U] = g.delay;
-            }
-         }
-      } while (u != 0);
-
-      // free temp buffer;
-      STBI_FREE(g.out);
-      STBI_FREE(g.history);
-      STBI_FREE(g.background);
-
-      // do the final conversion after loading everything;
-      if (req_comp && req_comp != 4)
-         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
-
-      *z = layers;
-      return out;
-   } else {
-      return stbi__errpuc("not GIF", "Image was not as a gif type.");
-   }
-}
-
-static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *u = 0;
-   stbi__gif g;
-   memset(&g, 0, sizeof(g));
-   STBI_NOTUSED(ri);
-
-   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
-   if (u) {
-      *x = g.w;
-      *y = g.h;
-
-      // moved conversion to after successful load so that the same
-      // can be done for multiple frames.
-      if (req_comp && req_comp != 4)
-         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
-   } else if (g.out) {
-      // if there was an error and we allocated an image buffer, free it!
-      STBI_FREE(g.out);
-   }
-
-   // free buffers needed for multiple frame loading;
-   STBI_FREE(g.history);
-   STBI_FREE(g.background);
-
-   return u;
-}
-
-static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   return stbi__gif_info_raw(s,x,y,comp);
-}
-#endif
-
-// *************************************************************************************************
-// Radiance RGBE HDR loader
-// originally by Nicolas Schulz
-#ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s, const char *signature)
-{
-   int i;
-   for (i=0; signature[i]; ++i)
-      if (stbi__get8(s) != signature[i])
-          return 0;
-   stbi__rewind(s);
-   return 1;
-}
-
-static int stbi__hdr_test(stbi__context* s)
-{
-   int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
-   stbi__rewind(s);
-   if(!r) {
-       r = stbi__hdr_test_core(s, "#?RGBE\n");
-       stbi__rewind(s);
-   }
-   return r;
-}
-
-#define STBI__HDR_BUFLEN  1024
-static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
-{
-   int len=0;
-   char c = '\0';
-
-   c = (char) stbi__get8(z);
-
-   while (!stbi__at_eof(z) && c != '\n') {
-      buffer[len++] = c;
-      if (len == STBI__HDR_BUFLEN-1) {
-         // flush to end of line
-         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
-            ;
-         break;
-      }
-      c = (char) stbi__get8(z);
-   }
-
-   buffer[len] = 0;
-   return buffer;
-}
-
-static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
-{
-   if ( input[3] != 0 ) {
-      float f1;
-      // Exponent
-      f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
-      if (req_comp <= 2)
-         output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
-      else {
-         output[0] = input[0] * f1;
-         output[1] = input[1] * f1;
-         output[2] = input[2] * f1;
-      }
-      if (req_comp == 2) output[1] = 1;
-      if (req_comp == 4) output[3] = 1;
-   } else {
-      switch (req_comp) {
-         case 4: output[3] = 1; /* fallthrough */
-         case 3: output[0] = output[1] = output[2] = 0;
-                 break;
-         case 2: output[1] = 1; /* fallthrough */
-         case 1: output[0] = 0;
-                 break;
-      }
-   }
-}
-
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   char buffer[STBI__HDR_BUFLEN];
-   char *token;
-   int valid = 0;
-   int width, height;
-   stbi_uc *scanline;
-   float *hdr_data;
-   int len;
-   unsigned char count, value;
-   int i, j, k, c1,c2, z;
-   const char *headerToken;
-   STBI_NOTUSED(ri);
-
-   // Check identifier
-   headerToken = stbi__hdr_gettoken(s,buffer);
-   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
-      return stbi__errpf("not HDR", "Corrupt HDR image");
-
-   // Parse header
-   for(;;) {
-      token = stbi__hdr_gettoken(s,buffer);
-      if (token[0] == 0) break;
-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
-   }
-
-   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
-
-   // Parse width and height
-   // can't use sscanf() if we're not using stdio!
-   token = stbi__hdr_gettoken(s,buffer);
-   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-   token += 3;
-   height = (int) strtol(token, &token, 10);
-   while (*token == ' ') ++token;
-   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-   token += 3;
-   width = (int) strtol(token, NULL, 10);
-
-   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
-   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
-
-   *x = width;
-   *y = height;
-
-   if (comp) *comp = 3;
-   if (req_comp == 0) req_comp = 3;
-
-   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
-      return stbi__errpf("too large", "HDR image is too large");
-
-   // Read data
-   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-   if (!hdr_data)
-      return stbi__errpf("outofmem", "Out of memory");
-
-   // Load image data
-   // image data is stored as some number of sca
-   if ( width < 8 || width >= 32768) {
-      // Read flat data
-      for (j=0; j < height; ++j) {
-         for (i=0; i < width; ++i) {
-            stbi_uc rgbe[4];
-           main_decode_loop:
-            stbi__getn(s, rgbe, 4);
-            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
-         }
-      }
-   } else {
-      // Read RLE-encoded data
-      scanline = NULL;
-
-      for (j = 0; j < height; ++j) {
-         c1 = stbi__get8(s);
-         c2 = stbi__get8(s);
-         len = stbi__get8(s);
-         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
-            // not run-length encoded, so we have to actually use THIS data as a decoded
-            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
-            stbi_uc rgbe[4];
-            rgbe[0] = (stbi_uc) c1;
-            rgbe[1] = (stbi_uc) c2;
-            rgbe[2] = (stbi_uc) len;
-            rgbe[3] = (stbi_uc) stbi__get8(s);
-            stbi__hdr_convert(hdr_data, rgbe, req_comp);
-            i = 1;
-            j = 0;
-            STBI_FREE(scanline);
-            goto main_decode_loop; // yes, this makes no sense
-         }
-         len <<= 8;
-         len |= stbi__get8(s);
-         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) {
-            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
-            if (!scanline) {
-               STBI_FREE(hdr_data);
-               return stbi__errpf("outofmem", "Out of memory");
-            }
-         }
-
-         for (k = 0; k < 4; ++k) {
-            int nleft;
-            i = 0;
-            while ((nleft = width - i) > 0) {
-               count = stbi__get8(s);
-               if (count > 128) {
-                  // Run
-                  value = stbi__get8(s);
-                  count -= 128;
-                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
-                  for (z = 0; z < count; ++z)
-                     scanline[i++ * 4 + k] = value;
-               } else {
-                  // Dump
-                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
-                  for (z = 0; z < count; ++z)
-                     scanline[i++ * 4 + k] = stbi__get8(s);
-               }
-            }
-         }
-         for (i=0; i < width; ++i)
-            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
-      }
-      if (scanline)
-         STBI_FREE(scanline);
-   }
-
-   return hdr_data;
-}
-
-static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   char buffer[STBI__HDR_BUFLEN];
-   char *token;
-   int valid = 0;
-   int dummy;
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   if (stbi__hdr_test(s) == 0) {
-       stbi__rewind( s );
-       return 0;
-   }
-
-   for(;;) {
-      token = stbi__hdr_gettoken(s,buffer);
-      if (token[0] == 0) break;
-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
-   }
-
-   if (!valid) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token = stbi__hdr_gettoken(s,buffer);
-   if (strncmp(token, "-Y ", 3)) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token += 3;
-   *y = (int) strtol(token, &token, 10);
-   while (*token == ' ') ++token;
-   if (strncmp(token, "+X ", 3)) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token += 3;
-   *x = (int) strtol(token, NULL, 10);
-   *comp = 3;
-   return 1;
-}
-#endif // STBI_NO_HDR
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   void *p;
-   stbi__bmp_data info;
-
-   info.all_a = 255;
-   p = stbi__bmp_parse_header(s, &info);
-   if (p == NULL) {
-      stbi__rewind( s );
-      return 0;
-   }
-   if (x) *x = s->img_x;
-   if (y) *y = s->img_y;
-   if (comp) {
-      if (info.bpp == 24 && info.ma == 0xff000000)
-         *comp = 3;
-      else
-         *comp = info.ma ? 4 : 3;
-   }
-   return 1;
-}
-#endif
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int channelCount, dummy, depth;
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-   if (stbi__get32be(s) != 0x38425053) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s, 6);
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *y = stbi__get32be(s);
-   *x = stbi__get32be(s);
-   depth = stbi__get16be(s);
-   if (depth != 8 && depth != 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 3) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *comp = 4;
-   return 1;
-}
-
-static int stbi__psd_is16(stbi__context *s)
-{
-   int channelCount, depth;
-   if (stbi__get32be(s) != 0x38425053) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s, 6);
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   STBI_NOTUSED(stbi__get32be(s));
-   STBI_NOTUSED(stbi__get32be(s));
-   depth = stbi__get16be(s);
-   if (depth != 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   return 1;
-}
-#endif
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int act_comp=0,num_packets=0,chained,dummy;
-   stbi__pic_packet packets[10];
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
-      stbi__rewind(s);
-      return 0;
-   }
-
-   stbi__skip(s, 88);
-
-   *x = stbi__get16be(s);
-   *y = stbi__get16be(s);
-   if (stbi__at_eof(s)) {
-      stbi__rewind( s);
-      return 0;
-   }
-   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
-      stbi__rewind( s );
-      return 0;
-   }
-
-   stbi__skip(s, 8);
-
-   do {
-      stbi__pic_packet *packet;
-
-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
-         return 0;
-
-      packet = &packets[num_packets++];
-      chained = stbi__get8(s);
-      packet->size    = stbi__get8(s);
-      packet->type    = stbi__get8(s);
-      packet->channel = stbi__get8(s);
-      act_comp |= packet->channel;
-
-      if (stbi__at_eof(s)) {
-          stbi__rewind( s );
-          return 0;
-      }
-      if (packet->size != 8) {
-          stbi__rewind( s );
-          return 0;
-      }
-   } while (chained);
-
-   *comp = (act_comp & 0x10 ? 4 : 3);
-
-   return 1;
-}
-#endif
-
-// *************************************************************************************************
-// Portable Gray Map and Portable Pixel Map loader
-// by Ken Miller
-//
-// PGM: http://netpbm.sourceforge.net/doc/pgm.html
-// PPM: http://netpbm.sourceforge.net/doc/ppm.html
-//
-// Known limitations:
-//    Does not support comments in the header section
-//    Does not support ASCII image data (formats P2 and P3)
-
-#ifndef STBI_NO_PNM
-
-static int      stbi__pnm_test(stbi__context *s)
-{
-   char p, t;
-   p = (char) stbi__get8(s);
-   t = (char) stbi__get8(s);
-   if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
-       return 0;
-   }
-   return 1;
-}
-
-static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *out;
-   STBI_NOTUSED(ri);
-
-   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
-   if (ri->bits_per_channel == 0)
-      return 0;
-
-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   *x = s->img_x;
-   *y = s->img_y;
-   if (comp) *comp = s->img_n;
-
-   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
-      return stbi__errpuc("too large", "PNM too large");
-
-   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
-      STBI_FREE(out);
-      return stbi__errpuc("bad PNM", "PNM file truncated");
-   }
-
-   if (req_comp && req_comp != s->img_n) {
-      if (ri->bits_per_channel == 16) {
-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
-      } else {
-         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-      }
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-   return out;
-}
-
-static int      stbi__pnm_isspace(char c)
-{
-   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
-}
-
-static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
-{
-   for (;;) {
-      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-         *c = (char) stbi__get8(s);
-
-      if (stbi__at_eof(s) || *c != '#')
-         break;
-
-      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
-         *c = (char) stbi__get8(s);
-   }
-}
-
-static int      stbi__pnm_isdigit(char c)
-{
-   return c >= '0' && c <= '9';
-}
-
-static int      stbi__pnm_getinteger(stbi__context *s, char *c)
-{
-   int value = 0;
-
-   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
-      value = value*10 + (*c - '0');
-      *c = (char) stbi__get8(s);
-      if((value > 214748364) || (value == 214748364 && *c > '7'))
-          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
-   }
-
-   return value;
-}
-
-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int maxv, dummy;
-   char c, p, t;
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   stbi__rewind(s);
-
-   // Get identifier
-   p = (char) stbi__get8(s);
-   t = (char) stbi__get8(s);
-   if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind(s);
-       return 0;
-   }
-
-   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
-
-   c = (char) stbi__get8(s);
-   stbi__pnm_skip_whitespace(s, &c);
-
-   *x = stbi__pnm_getinteger(s, &c); // read width
-   if(*x == 0)
-       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
-   stbi__pnm_skip_whitespace(s, &c);
-
-   *y = stbi__pnm_getinteger(s, &c); // read height
-   if (*y == 0)
-       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
-   stbi__pnm_skip_whitespace(s, &c);
-
-   maxv = stbi__pnm_getinteger(s, &c);  // read max value
-   if (maxv > 65535)
-      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
-   else if (maxv > 255)
-      return 16;
-   else
-      return 8;
-}
-
-static int stbi__pnm_is16(stbi__context *s)
-{
-   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
-	   return 1;
-   return 0;
-}
-#endif
-
-static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
-{
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_info(s, x, y, comp)) return 1;
-   #endif
-
-   #ifndef STBI_NO_PNG
-   if (stbi__png_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_GIF
-   if (stbi__gif_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_BMP
-   if (stbi__bmp_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PIC
-   if (stbi__pic_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_info(s, x, y, comp))  return 1;
-   #endif
-
-   // test tga last because it's a crappy test!
-   #ifndef STBI_NO_TGA
-   if (stbi__tga_info(s, x, y, comp))
-       return 1;
-   #endif
-   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
-}
-
-static int stbi__is_16_main(stbi__context *s)
-{
-   #ifndef STBI_NO_PNG
-   if (stbi__png_is16(s))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_is16(s))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_is16(s))  return 1;
-   #endif
-   return 0;
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
-{
-    FILE *f = stbi__fopen(filename, "rb");
-    int result;
-    if (!f) return stbi__err("can't fopen", "Unable to open file");
-    result = stbi_info_from_file(f, x, y, comp);
-    fclose(f);
-    return result;
-}
-
-STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
-{
-   int r;
-   stbi__context s;
-   long pos = ftell(f);
-   stbi__start_file(&s, f);
-   r = stbi__info_main(&s,x,y,comp);
-   fseek(f,pos,SEEK_SET);
-   return r;
-}
-
-STBIDEF int stbi_is_16_bit(char const *filename)
-{
-    FILE *f = stbi__fopen(filename, "rb");
-    int result;
-    if (!f) return stbi__err("can't fopen", "Unable to open file");
-    result = stbi_is_16_bit_from_file(f);
-    fclose(f);
-    return result;
-}
-
-STBIDEF int stbi_is_16_bit_from_file(FILE *f)
-{
-   int r;
-   stbi__context s;
-   long pos = ftell(f);
-   stbi__start_file(&s, f);
-   r = stbi__is_16_main(&s);
-   fseek(f,pos,SEEK_SET);
-   return r;
-}
-#endif // !STBI_NO_STDIO
-
-STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__info_main(&s,x,y,comp);
-}
-
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
-   return stbi__info_main(&s,x,y,comp);
-}
-
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__is_16_main(&s);
-}
-
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
-   return stbi__is_16_main(&s);
-}
-
-#endif // STB_IMAGE_IMPLEMENTATION
-
-/*
-   revision history:
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
-                         1-bit BMP
-                         *_is_16_bit api
-                         avoid warnings
-      2.16  (2017-07-23) all functions have 16-bit variants;
-                         STBI_NO_STDIO works again;
-                         compilation fixes;
-                         fix rounding in unpremultiply;
-                         optimize vertical flip;
-                         disable raw_len validation;
-                         documentation fixes
-      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
-                         warning fixes; disable run-time SSE detection on gcc;
-                         uniform handling of optional "return" values;
-                         thread-safe initialization of zlib tables
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) allocate large structures on the stack
-                         remove white matting for transparent PSD
-                         fix reported channel count for PNG & BMP
-                         re-enable SSE2 in non-gcc 64-bit
-                         support RGB-formatted JPEG
-                         read 16-bit PNGs (only as 8-bit)
-      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
-      2.09  (2016-01-16) allow comments in PNM files
-                         16-bit-per-pixel TGA (not bit-per-component)
-                         info() for TGA could break due to .hdr handling
-                         info() for BMP to shares code instead of sloppy parse
-                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
-                         code cleanup
-      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
-      2.07  (2015-09-13) fix compiler warnings
-                         partial animated GIF support
-                         limited 16-bpc PSD support
-                         #ifdef unused functions
-                         bug with < 92 byte PIC,PNM,HDR,TGA
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) extra corruption checking (mmozeiko)
-                         stbi_set_flip_vertically_on_load (nguillemot)
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
-                         progressive JPEG (stb)
-                         PGM/PPM support (Ken Miller)
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         GIF bugfix -- seemingly never worked
-                         STBI_NO_*, STBI_ONLY_*
-      1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
-                         optimize PNG (ryg)
-                         fix bug in interlaced PNG with user-specified channel count (stb)
-      1.46  (2014-08-26)
-              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
-      1.45  (2014-08-16)
-              fix MSVC-ARM internal compiler error by wrapping malloc
-      1.44  (2014-08-07)
-              various warning fixes from Ronny Chevalier
-      1.43  (2014-07-15)
-              fix MSVC-only compiler problem in code changed in 1.42
-      1.42  (2014-07-09)
-              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
-              fixes to stbi__cleanup_jpeg path
-              added STBI_ASSERT to avoid requiring assert.h
-      1.41  (2014-06-25)
-              fix search&replace from 1.36 that messed up comments/error messages
-      1.40  (2014-06-22)
-              fix gcc struct-initialization warning
-      1.39  (2014-06-15)
-              fix to TGA optimization when req_comp != number of components in TGA;
-              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
-              add support for BMP version 5 (more ignored fields)
-      1.38  (2014-06-06)
-              suppress MSVC warnings on integer casts truncating values
-              fix accidental rename of 'skip' field of I/O
-      1.37  (2014-06-04)
-              remove duplicate typedef
-      1.36  (2014-06-03)
-              convert to header file single-file library
-              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
-      1.35  (2014-05-27)
-              various warnings
-              fix broken STBI_SIMD path
-              fix bug where stbi_load_from_file no longer left file pointer in correct place
-              fix broken non-easy path for 32-bit BMP (possibly never used)
-              TGA optimization by Arseny Kapoulkine
-      1.34  (unknown)
-              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
-      1.33  (2011-07-14)
-              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
-      1.32  (2011-07-13)
-              support for "info" function for all supported filetypes (SpartanJ)
-      1.31  (2011-06-20)
-              a few more leak fixes, bug in PNG handling (SpartanJ)
-      1.30  (2011-06-11)
-              added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
-              removed deprecated format-specific test/load functions
-              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
-              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
-              fix inefficiency in decoding 32-bit BMP (David Woo)
-      1.29  (2010-08-16)
-              various warning fixes from Aurelien Pocheville
-      1.28  (2010-08-01)
-              fix bug in GIF palette transparency (SpartanJ)
-      1.27  (2010-08-01)
-              cast-to-stbi_uc to fix warnings
-      1.26  (2010-07-24)
-              fix bug in file buffering for PNG reported by SpartanJ
-      1.25  (2010-07-17)
-              refix trans_data warning (Won Chun)
-      1.24  (2010-07-12)
-              perf improvements reading from files on platforms with lock-heavy fgetc()
-              minor perf improvements for jpeg
-              deprecated type-specific functions so we'll get feedback if they're needed
-              attempt to fix trans_data warning (Won Chun)
-      1.23    fixed bug in iPhone support
-      1.22  (2010-07-10)
-              removed image *writing* support
-              stbi_info support from Jetro Lauha
-              GIF support from Jean-Marc Lienher
-              iPhone PNG-extensions from James Brown
-              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
-      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
-      1.20    added support for Softimage PIC, by Tom Seddon
-      1.19    bug in interlaced PNG corruption check (found by ryg)
-      1.18  (2008-08-02)
-              fix a threading bug (local mutable static)
-      1.17    support interlaced PNG
-      1.16    major bugfix - stbi__convert_format converted one too many pixels
-      1.15    initialize some fields for thread safety
-      1.14    fix threadsafe conversion bug
-              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
-      1.13    threadsafe
-      1.12    const qualifiers in the API
-      1.11    Support installable IDCT, colorspace conversion routines
-      1.10    Fixes for 64-bit (don't use "unsigned long")
-              optimized upsampling by Fabian "ryg" Giesen
-      1.09    Fix format-conversion for PSD code (bad global variables!)
-      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
-      1.07    attempt to fix C++ warning/errors again
-      1.06    attempt to fix C++ warning/errors again
-      1.05    fix TGA loading to return correct *comp and use good luminance calc
-      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
-      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
-      1.02    support for (subset of) HDR files, float interface for preferred access to them
-      1.01    fix bug: possible bug in handling right-side up bmps... not sure
-              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
-      1.00    interface to zlib that skips zlib header
-      0.99    correct handling of alpha in palette
-      0.98    TGA loader by lonesock; dynamically add loaders (untested)
-      0.97    jpeg errors on too large a file; also catch another malloc failure
-      0.96    fix detection of invalid v value - particleman@mollyrocket forum
-      0.95    during header scan, seek to markers in case of padding
-      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
-      0.93    handle jpegtran output; verbose errors
-      0.92    read 4,8,16,24,32-bit BMP files of several formats
-      0.91    output 24-bit Windows 3.0 BMP files
-      0.90    fix a few more warnings; bump version number to approach 1.0
-      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
-      0.60    fix compiling as c++
-      0.59    fix warnings: merge Dave Moore's -Wall fixes
-      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
-      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
-      0.56    fix bug: zlib uncompressed mode len vs. nlen
-      0.55    fix bug: restart_interval not initialized to 0
-      0.54    allow NULL for 'int *comp'
-      0.53    fix bug in png 3->4; speedup png decoding
-      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
-      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
-              on 'test' only check type, not whether we support this variant
-      0.50  (2006-11-19)
-              first released version
-*/
-
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/

+ 0 - 1724
ggml/examples/stb_image_write.h

@@ -1,1724 +0,0 @@
-/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
-   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
-                                     no warranty implied; use at your own risk
-
-   Before #including,
-
-       #define STB_IMAGE_WRITE_IMPLEMENTATION
-
-   in the file that you want to have the implementation.
-
-   Will probably not work correctly with strict-aliasing optimizations.
-
-ABOUT:
-
-   This header file is a library for writing images to C stdio or a callback.
-
-   The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation; though providing a custom
-   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
-   This library is designed for source code compactness and simplicity,
-   not optimal image file size or run-time performance.
-
-BUILDING:
-
-   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
-   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
-   malloc,realloc,free.
-   You can #define STBIW_MEMMOVE() to replace memmove()
-   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
-   for PNG compression (instead of the builtin one), it must have the following signature:
-   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
-   The returned data will be freed with STBIW_FREE() (free() by default),
-   so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
-
-UNICODE:
-
-   If compiling for Windows and you wish to use Unicode filenames, compile
-   with
-       #define STBIW_WINDOWS_UTF8
-   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
-   Windows wchar_t filenames to utf8.
-
-USAGE:
-
-   There are five functions, one for each image file format:
-
-     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
-     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-
-     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
-
-   There are also five equivalent functions that use an arbitrary write function. You are
-   expected to open/close your file-equivalent before and after calling these:
-
-     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
-     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
-
-   where the callback is:
-      void stbi_write_func(void *context, void *data, int size);
-
-   You can configure it with these global variables:
-      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
-      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
-      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
-
-
-   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
-   functions, so the library will not use stdio.h at all. However, this will
-   also disable HDR writing, because it requires stdio for formatted output.
-
-   Each function returns 0 on failure and non-0 on success.
-
-   The functions create an image file defined by the parameters. The image
-   is a rectangle of pixels stored from left-to-right, top-to-bottom.
-   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
-   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
-   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
-   The *data pointer points to the first byte of the top-left-most pixel.
-   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
-   a row of pixels to the first byte of the next row of pixels.
-
-   PNG creates output files with the same number of components as the input.
-   The BMP format expands Y to RGB in the file format and does not
-   output alpha.
-
-   PNG supports writing rectangles of data even when the bytes storing rows of
-   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
-   by supplying the stride between the beginning of adjacent rows. The other
-   formats do not. (Thus you cannot write a native-format BMP through the BMP
-   writer, both because it is in BGR order and because it may have padding
-   at the end of the line.)
-
-   PNG allows you to set the deflate compression level by setting the global
-   variable 'stbi_write_png_compression_level' (it defaults to 8).
-
-   HDR expects linear float data. Since the format is always 32-bit rgb(e)
-   data, alpha (if provided) is discarded, and for monochrome data it is
-   replicated across all three channels.
-
-   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
-   data, set the global variable 'stbi_write_tga_with_rle' to 0.
-
-   JPEG does ignore alpha channels in input data; quality is between 1 and 100.
-   Higher quality looks better but results in a bigger image.
-   JPEG baseline (no JPEG progressive).
-
-CREDITS:
-
-
-   Sean Barrett           -    PNG/BMP/TGA
-   Baldur Karlsson        -    HDR
-   Jean-Sebastien Guay    -    TGA monochrome
-   Tim Kelsey             -    misc enhancements
-   Alan Hickman           -    TGA RLE
-   Emmanuel Julien        -    initial file IO callback implementation
-   Jon Olick              -    original jo_jpeg.cpp code
-   Daniel Gibson          -    integrate JPEG, allow external zlib
-   Aarni Koskela          -    allow choosing PNG filter
-
-   bugfixes:
-      github:Chribba
-      Guillaume Chereau
-      github:jry2
-      github:romigrou
-      Sergio Gonzalez
-      Jonas Karlsson
-      Filip Wasil
-      Thatcher Ulrich
-      github:poppolopoppo
-      Patrick Boettcher
-      github:xeekworx
-      Cap Petschulat
-      Simon Rodriguez
-      Ivan Tikhonov
-      github:ignotion
-      Adam Schackart
-      Andrew Kensler
-
-LICENSE
-
-  See end of file for license information.
-
-*/
-
-#ifndef INCLUDE_STB_IMAGE_WRITE_H
-#define INCLUDE_STB_IMAGE_WRITE_H
-
-#include <stdlib.h>
-
-// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
-#ifndef STBIWDEF
-#ifdef STB_IMAGE_WRITE_STATIC
-#define STBIWDEF  static
-#else
-#ifdef __cplusplus
-#define STBIWDEF  extern "C"
-#else
-#define STBIWDEF  extern
-#endif
-#endif
-#endif
-
-#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
-STBIWDEF int stbi_write_tga_with_rle;
-STBIWDEF int stbi_write_png_compression_level;
-STBIWDEF int stbi_write_force_png_filter;
-#endif
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
-
-#ifdef STBIW_WINDOWS_UTF8
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
-#endif
-#endif
-
-typedef void stbi_write_func(void *context, void *data, int size);
-
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
-
-STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
-
-#endif//INCLUDE_STB_IMAGE_WRITE_H
-
-#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
-
-#ifdef _WIN32
-   #ifndef _CRT_SECURE_NO_WARNINGS
-   #define _CRT_SECURE_NO_WARNINGS
-   #endif
-   #ifndef _CRT_NONSTDC_NO_DEPRECATE
-   #define _CRT_NONSTDC_NO_DEPRECATE
-   #endif
-#endif
-
-#ifndef STBI_WRITE_NO_STDIO
-#include <stdio.h>
-#endif // STBI_WRITE_NO_STDIO
-
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
-// ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
-// ok
-#else
-#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
-#endif
-
-#ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz)        malloc(sz)
-#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
-#define STBIW_FREE(p)           free(p)
-#endif
-
-#ifndef STBIW_REALLOC_SIZED
-#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
-#endif
-
-
-#ifndef STBIW_MEMMOVE
-#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
-#endif
-
-
-#ifndef STBIW_ASSERT
-#include <assert.h>
-#define STBIW_ASSERT(x) assert(x)
-#endif
-
-#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
-
-#ifdef STB_IMAGE_WRITE_STATIC
-static int stbi_write_png_compression_level = 8;
-static int stbi_write_tga_with_rle = 1;
-static int stbi_write_force_png_filter = -1;
-#else
-int stbi_write_png_compression_level = 8;
-int stbi_write_tga_with_rle = 1;
-int stbi_write_force_png_filter = -1;
-#endif
-
-static int stbi__flip_vertically_on_write = 0;
-
-STBIWDEF void stbi_flip_vertically_on_write(int flag)
-{
-   stbi__flip_vertically_on_write = flag;
-}
-
-typedef struct
-{
-   stbi_write_func *func;
-   void *context;
-   unsigned char buffer[64];
-   int buf_used;
-} stbi__write_context;
-
-// initialize a callback-based context
-static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
-{
-   s->func    = c;
-   s->context = context;
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-
-static void stbi__stdio_write(void *context, void *data, int size)
-{
-   fwrite(data,1,size,(FILE*) context);
-}
-
-#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
-#ifdef __cplusplus
-#define STBIW_EXTERN extern "C"
-#else
-#define STBIW_EXTERN extern
-#endif
-STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
-STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
-
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
-{
-   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
-}
-#endif
-
-static FILE *stbiw__fopen(char const *filename, char const *mode)
-{
-   FILE *f;
-#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
-   wchar_t wMode[64];
-   wchar_t wFilename[1024];
-   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
-      return 0;
-
-   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
-      return 0;
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != _wfopen_s(&f, wFilename, wMode))
-      f = 0;
-#else
-   f = _wfopen(wFilename, wMode);
-#endif
-
-#elif defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, mode))
-      f=0;
-#else
-   f = fopen(filename, mode);
-#endif
-   return f;
-}
-
-static int stbi__start_write_file(stbi__write_context *s, const char *filename)
-{
-   FILE *f = stbiw__fopen(filename, "wb");
-   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
-   return f != NULL;
-}
-
-static void stbi__end_write_file(stbi__write_context *s)
-{
-   fclose((FILE *)s->context);
-}
-
-#endif // !STBI_WRITE_NO_STDIO
-
-typedef unsigned int stbiw_uint32;
-typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
-
-static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
-{
-   while (*fmt) {
-      switch (*fmt++) {
-         case ' ': break;
-         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
-                     s->func(s->context,&x,1);
-                     break; }
-         case '2': { int x = va_arg(v,int);
-                     unsigned char b[2];
-                     b[0] = STBIW_UCHAR(x);
-                     b[1] = STBIW_UCHAR(x>>8);
-                     s->func(s->context,b,2);
-                     break; }
-         case '4': { stbiw_uint32 x = va_arg(v,int);
-                     unsigned char b[4];
-                     b[0]=STBIW_UCHAR(x);
-                     b[1]=STBIW_UCHAR(x>>8);
-                     b[2]=STBIW_UCHAR(x>>16);
-                     b[3]=STBIW_UCHAR(x>>24);
-                     s->func(s->context,b,4);
-                     break; }
-         default:
-            STBIW_ASSERT(0);
-            return;
-      }
-   }
-}
-
-static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
-{
-   va_list v;
-   va_start(v, fmt);
-   stbiw__writefv(s, fmt, v);
-   va_end(v);
-}
-
-static void stbiw__write_flush(stbi__write_context *s)
-{
-   if (s->buf_used) {
-      s->func(s->context, &s->buffer, s->buf_used);
-      s->buf_used = 0;
-   }
-}
-
-static void stbiw__putc(stbi__write_context *s, unsigned char c)
-{
-   s->func(s->context, &c, 1);
-}
-
-static void stbiw__write1(stbi__write_context *s, unsigned char a)
-{
-   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
-      stbiw__write_flush(s);
-   s->buffer[s->buf_used++] = a;
-}
-
-static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
-{
-   int n;
-   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
-      stbiw__write_flush(s);
-   n = s->buf_used;
-   s->buf_used = n+3;
-   s->buffer[n+0] = a;
-   s->buffer[n+1] = b;
-   s->buffer[n+2] = c;
-}
-
-static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
-{
-   unsigned char bg[3] = { 255, 0, 255}, px[3];
-   int k;
-
-   if (write_alpha < 0)
-      stbiw__write1(s, d[comp - 1]);
-
-   switch (comp) {
-      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
-      case 1:
-         if (expand_mono)
-            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
-         else
-            stbiw__write1(s, d[0]);  // monochrome TGA
-         break;
-      case 4:
-         if (!write_alpha) {
-            // composite against pink background
-            for (k = 0; k < 3; ++k)
-               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
-            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
-            break;
-         }
-         /* FALLTHROUGH */
-      case 3:
-         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
-         break;
-   }
-   if (write_alpha > 0)
-      stbiw__write1(s, d[comp - 1]);
-}
-
-static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
-{
-   stbiw_uint32 zero = 0;
-   int i,j, j_end;
-
-   if (y <= 0)
-      return;
-
-   if (stbi__flip_vertically_on_write)
-      vdir *= -1;
-
-   if (vdir < 0) {
-      j_end = -1; j = y-1;
-   } else {
-      j_end =  y; j = 0;
-   }
-
-   for (; j != j_end; j += vdir) {
-      for (i=0; i < x; ++i) {
-         unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
-         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
-      }
-      stbiw__write_flush(s);
-      s->func(s->context, &zero, scanline_pad);
-   }
-}
-
-static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
-{
-   if (y < 0 || x < 0) {
-      return 0;
-   } else {
-      va_list v;
-      va_start(v, fmt);
-      stbiw__writefv(s, fmt, v);
-      va_end(v);
-      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
-      return 1;
-   }
-}
-
-static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
-{
-   if (comp != 4) {
-      // write RGB bitmap
-      int pad = (-x*3) & 3;
-      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-              "11 4 22 4" "4 44 22 444444",
-              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
-   } else {
-      // RGBA bitmaps need a v4 header
-      // use BI_BITFIELDS mode with 32bpp and alpha mask
-      // (straight BI_RGB with alpha mask doesn't work in most readers)
-      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
-         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
-         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
-         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
-   }
-}
-
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_bmp_core(&s, x, y, comp, data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_bmp_core(&s, x, y, comp, data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif //!STBI_WRITE_NO_STDIO
-
-static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
-{
-   int has_alpha = (comp == 2 || comp == 4);
-   int colorbytes = has_alpha ? comp-1 : comp;
-   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-
-   if (y < 0 || x < 0)
-      return 0;
-
-   if (!stbi_write_tga_with_rle) {
-      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
-         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
-   } else {
-      int i,j,k;
-      int jend, jdir;
-
-      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
-
-      if (stbi__flip_vertically_on_write) {
-         j = 0;
-         jend = y;
-         jdir = 1;
-      } else {
-         j = y-1;
-         jend = -1;
-         jdir = -1;
-      }
-      for (; j != jend; j += jdir) {
-         unsigned char *row = (unsigned char *) data + j * x * comp;
-         int len;
-
-         for (i = 0; i < x; i += len) {
-            unsigned char *begin = row + i * comp;
-            int diff = 1;
-            len = 1;
-
-            if (i < x - 1) {
-               ++len;
-               diff = memcmp(begin, row + (i + 1) * comp, comp);
-               if (diff) {
-                  const unsigned char *prev = begin;
-                  for (k = i + 2; k < x && len < 128; ++k) {
-                     if (memcmp(prev, row + k * comp, comp)) {
-                        prev += comp;
-                        ++len;
-                     } else {
-                        --len;
-                        break;
-                     }
-                  }
-               } else {
-                  for (k = i + 2; k < x && len < 128; ++k) {
-                     if (!memcmp(begin, row + k * comp, comp)) {
-                        ++len;
-                     } else {
-                        break;
-                     }
-                  }
-               }
-            }
-
-            if (diff) {
-               unsigned char header = STBIW_UCHAR(len - 1);
-               stbiw__write1(s, header);
-               for (k = 0; k < len; ++k) {
-                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
-               }
-            } else {
-               unsigned char header = STBIW_UCHAR(len - 129);
-               stbiw__write1(s, header);
-               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
-            }
-         }
-      }
-      stbiw__write_flush(s);
-   }
-   return 1;
-}
-
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif
-
-// *************************************************************************************************
-// Radiance RGBE HDR writer
-// by Baldur Karlsson
-
-#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
-
-#ifndef STBI_WRITE_NO_STDIO
-
-static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
-{
-   int exponent;
-   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
-
-   if (maxcomp < 1e-32f) {
-      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
-   } else {
-      float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
-
-      rgbe[0] = (unsigned char)(linear[0] * normalize);
-      rgbe[1] = (unsigned char)(linear[1] * normalize);
-      rgbe[2] = (unsigned char)(linear[2] * normalize);
-      rgbe[3] = (unsigned char)(exponent + 128);
-   }
-}
-
-static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
-{
-   unsigned char lengthbyte = STBIW_UCHAR(length+128);
-   STBIW_ASSERT(length+128 <= 255);
-   s->func(s->context, &lengthbyte, 1);
-   s->func(s->context, &databyte, 1);
-}
-
-static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
-{
-   unsigned char lengthbyte = STBIW_UCHAR(length);
-   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
-   s->func(s->context, &lengthbyte, 1);
-   s->func(s->context, data, length);
-}
-
-static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
-{
-   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
-   unsigned char rgbe[4];
-   float linear[3];
-   int x;
-
-   scanlineheader[2] = (width&0xff00)>>8;
-   scanlineheader[3] = (width&0x00ff);
-
-   /* skip RLE for images too small or large */
-   if (width < 8 || width >= 32768) {
-      for (x=0; x < width; x++) {
-         switch (ncomp) {
-            case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*ncomp + 2];
-                    linear[1] = scanline[x*ncomp + 1];
-                    linear[0] = scanline[x*ncomp + 0];
-                    break;
-            default:
-                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
-                    break;
-         }
-         stbiw__linear_to_rgbe(rgbe, linear);
-         s->func(s->context, rgbe, 4);
-      }
-   } else {
-      int c,r;
-      /* encode into scratch buffer */
-      for (x=0; x < width; x++) {
-         switch(ncomp) {
-            case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*ncomp + 2];
-                    linear[1] = scanline[x*ncomp + 1];
-                    linear[0] = scanline[x*ncomp + 0];
-                    break;
-            default:
-                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
-                    break;
-         }
-         stbiw__linear_to_rgbe(rgbe, linear);
-         scratch[x + width*0] = rgbe[0];
-         scratch[x + width*1] = rgbe[1];
-         scratch[x + width*2] = rgbe[2];
-         scratch[x + width*3] = rgbe[3];
-      }
-
-      s->func(s->context, scanlineheader, 4);
-
-      /* RLE each component separately */
-      for (c=0; c < 4; c++) {
-         unsigned char *comp = &scratch[width*c];
-
-         x = 0;
-         while (x < width) {
-            // find first run
-            r = x;
-            while (r+2 < width) {
-               if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
-                  break;
-               ++r;
-            }
-            if (r+2 >= width)
-               r = width;
-            // dump up to first run
-            while (x < r) {
-               int len = r-x;
-               if (len > 128) len = 128;
-               stbiw__write_dump_data(s, len, &comp[x]);
-               x += len;
-            }
-            // if there's a run, output it
-            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
-               // find next byte after run
-               while (r < width && comp[r] == comp[x])
-                  ++r;
-               // output run up to r
-               while (x < r) {
-                  int len = r-x;
-                  if (len > 127) len = 127;
-                  stbiw__write_run_data(s, len, comp[x]);
-                  x += len;
-               }
-            }
-         }
-      }
-   }
-}
-
-static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
-{
-   if (y <= 0 || x <= 0 || data == NULL)
-      return 0;
-   else {
-      // Each component is stored separately. Allocate scratch space for full output scanline.
-      unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
-      int i, len;
-      char buffer[128];
-      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
-      s->func(s->context, header, sizeof(header)-1);
-
-#ifdef __STDC_LIB_EXT1__
-      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
-#else
-      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
-#endif
-      s->func(s->context, buffer, len);
-
-      for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
-      STBIW_FREE(scratch);
-      return 1;
-   }
-}
-
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
-}
-
-STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif // STBI_WRITE_NO_STDIO
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// PNG writer
-//
-
-#ifndef STBIW_ZLIB_COMPRESS
-// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
-#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
-#define stbiw__sbn(a)   stbiw__sbraw(a)[1]
-
-#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
-#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
-#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
-
-#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
-#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
-#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
-
-static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
-{
-   int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
-   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
-   STBIW_ASSERT(p);
-   if (p) {
-      if (!*arr) ((int *) p)[1] = 0;
-      *arr = (void *) ((int *) p + 2);
-      stbiw__sbm(*arr) = m;
-   }
-   return *arr;
-}
-
-static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
-{
-   while (*bitcount >= 8) {
-      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
-      *bitbuffer >>= 8;
-      *bitcount -= 8;
-   }
-   return data;
-}
-
-static int stbiw__zlib_bitrev(int code, int codebits)
-{
-   int res=0;
-   while (codebits--) {
-      res = (res << 1) | (code & 1);
-      code >>= 1;
-   }
-   return res;
-}
-
-static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
-{
-   int i;
-   for (i=0; i < limit && i < 258; ++i)
-      if (a[i] != b[i]) break;
-   return i;
-}
-
-static unsigned int stbiw__zhash(unsigned char *data)
-{
-   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
-   hash ^= hash << 3;
-   hash += hash >> 5;
-   hash ^= hash << 4;
-   hash += hash >> 17;
-   hash ^= hash << 25;
-   hash += hash >> 6;
-   return hash;
-}
-
-#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
-#define stbiw__zlib_add(code,codebits) \
-      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
-#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
-// default huffman tables
-#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
-#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
-#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
-#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
-#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
-#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
-
-#define stbiw__ZHASH   16384
-
-#endif // STBIW_ZLIB_COMPRESS
-
-STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
-{
-#ifdef STBIW_ZLIB_COMPRESS
-   // user provided a zlib compress implementation, use that
-   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
-#else // use builtin
-   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
-   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
-   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
-   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
-   unsigned int bitbuf=0;
-   int i,j, bitcount=0;
-   unsigned char *out = NULL;
-   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
-   if (hash_table == NULL)
-      return NULL;
-   if (quality < 5) quality = 5;
-
-   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
-   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
-   stbiw__zlib_add(1,1);  // BFINAL = 1
-   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
-
-   for (i=0; i < stbiw__ZHASH; ++i)
-      hash_table[i] = NULL;
-
-   i=0;
-   while (i < data_len-3) {
-      // hash next 3 bytes of data to be compressed
-      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
-      unsigned char *bestloc = 0;
-      unsigned char **hlist = hash_table[h];
-      int n = stbiw__sbcount(hlist);
-      for (j=0; j < n; ++j) {
-         if (hlist[j]-data > i-32768) { // if entry lies within window
-            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
-            if (d >= best) { best=d; bestloc=hlist[j]; }
-         }
-      }
-      // when hash table entry is too long, delete half the entries
-      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
-         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
-         stbiw__sbn(hash_table[h]) = quality;
-      }
-      stbiw__sbpush(hash_table[h],data+i);
-
-      if (bestloc) {
-         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
-         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
-         hlist = hash_table[h];
-         n = stbiw__sbcount(hlist);
-         for (j=0; j < n; ++j) {
-            if (hlist[j]-data > i-32767) {
-               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
-               if (e > best) { // if next match is better, bail on current match
-                  bestloc = NULL;
-                  break;
-               }
-            }
-         }
-      }
-
-      if (bestloc) {
-         int d = (int) (data+i - bestloc); // distance back
-         STBIW_ASSERT(d <= 32767 && best <= 258);
-         for (j=0; best > lengthc[j+1]-1; ++j);
-         stbiw__zlib_huff(j+257);
-         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
-         for (j=0; d > distc[j+1]-1; ++j);
-         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
-         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
-         i += best;
-      } else {
-         stbiw__zlib_huffb(data[i]);
-         ++i;
-      }
-   }
-   // write out final bytes
-   for (;i < data_len; ++i)
-      stbiw__zlib_huffb(data[i]);
-   stbiw__zlib_huff(256); // end of block
-   // pad with 0 bits to byte boundary
-   while (bitcount)
-      stbiw__zlib_add(0,1);
-
-   for (i=0; i < stbiw__ZHASH; ++i)
-      (void) stbiw__sbfree(hash_table[i]);
-   STBIW_FREE(hash_table);
-
-   // store uncompressed instead if compression was worse
-   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
-      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
-      for (j = 0; j < data_len;) {
-         int blocklen = data_len - j;
-         if (blocklen > 32767) blocklen = 32767;
-         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
-         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
-         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
-         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
-         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
-         memcpy(out+stbiw__sbn(out), data+j, blocklen);
-         stbiw__sbn(out) += blocklen;
-         j += blocklen;
-      }
-   }
-
-   {
-      // compute adler32 on input
-      unsigned int s1=1, s2=0;
-      int blocklen = (int) (data_len % 5552);
-      j=0;
-      while (j < data_len) {
-         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
-         s1 %= 65521; s2 %= 65521;
-         j += blocklen;
-         blocklen = 5552;
-      }
-      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
-      stbiw__sbpush(out, STBIW_UCHAR(s2));
-      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
-      stbiw__sbpush(out, STBIW_UCHAR(s1));
-   }
-   *out_len = stbiw__sbn(out);
-   // make returned pointer freeable
-   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
-   return (unsigned char *) stbiw__sbraw(out);
-#endif // STBIW_ZLIB_COMPRESS
-}
-
-static unsigned int stbiw__crc32(unsigned char *buffer, int len)
-{
-#ifdef STBIW_CRC32
-    return STBIW_CRC32(buffer, len);
-#else
-   static unsigned int crc_table[256] =
-   {
-      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
-      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
-      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
-      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
-      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
-      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
-      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
-      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
-      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
-      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
-      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
-      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
-      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
-      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
-      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
-      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
-      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
-      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
-      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
-      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
-      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
-      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
-   };
-
-   unsigned int crc = ~0u;
-   int i;
-   for (i=0; i < len; ++i)
-      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
-   return ~crc;
-#endif
-}
-
-#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
-#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
-#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
-
-static void stbiw__wpcrc(unsigned char **data, int len)
-{
-   unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
-   stbiw__wp32(*data, crc);
-}
-
-static unsigned char stbiw__paeth(int a, int b, int c)
-{
-   int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
-   if (pb <= pc) return STBIW_UCHAR(b);
-   return STBIW_UCHAR(c);
-}
-
-// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
-static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
-{
-   static int mapping[] = { 0,1,2,3,4 };
-   static int firstmap[] = { 0,1,0,5,6 };
-   int *mymap = (y != 0) ? mapping : firstmap;
-   int i;
-   int type = mymap[filter_type];
-   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
-   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-
-   if (type==0) {
-      memcpy(line_buffer, z, width*n);
-      return;
-   }
-
-   // first loop isn't optimized since it's just one pixel
-   for (i = 0; i < n; ++i) {
-      switch (type) {
-         case 1: line_buffer[i] = z[i]; break;
-         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
-         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
-         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
-         case 5: line_buffer[i] = z[i]; break;
-         case 6: line_buffer[i] = z[i]; break;
-      }
-   }
-   switch (type) {
-      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
-      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
-      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
-      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
-      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
-      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-   }
-}
-
-STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
-{
-   int force_filter = stbi_write_force_png_filter;
-   int ctype[5] = { -1, 0, 4, 2, 6 };
-   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
-   unsigned char *out,*o, *filt, *zlib;
-   signed char *line_buffer;
-   int j,zlen;
-
-   if (stride_bytes == 0)
-      stride_bytes = x * n;
-
-   if (force_filter >= 5) {
-      force_filter = -1;
-   }
-
-   filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
-   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
-   for (j=0; j < y; ++j) {
-      int filter_type;
-      if (force_filter > -1) {
-         filter_type = force_filter;
-         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
-      } else { // Estimate the best filter by running through all of them:
-         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-         for (filter_type = 0; filter_type < 5; filter_type++) {
-            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
-
-            // Estimate the entropy of the line using this filter; the less, the better.
-            est = 0;
-            for (i = 0; i < x*n; ++i) {
-               est += abs((signed char) line_buffer[i]);
-            }
-            if (est < best_filter_val) {
-               best_filter_val = est;
-               best_filter = filter_type;
-            }
-         }
-         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
-            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
-            filter_type = best_filter;
-         }
-      }
-      // when we get here, filter_type contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) filter_type;
-      STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
-   }
-   STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
-   STBIW_FREE(filt);
-   if (!zlib) return 0;
-
-   // each tag requires 12 bytes of overhead
-   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
-   if (!out) return 0;
-   *out_len = 8 + 12+13 + 12+zlen + 12;
-
-   o=out;
-   STBIW_MEMMOVE(o,sig,8); o+= 8;
-   stbiw__wp32(o, 13); // header length
-   stbiw__wptag(o, "IHDR");
-   stbiw__wp32(o, x);
-   stbiw__wp32(o, y);
-   *o++ = 8;
-   *o++ = STBIW_UCHAR(ctype[n]);
-   *o++ = 0;
-   *o++ = 0;
-   *o++ = 0;
-   stbiw__wpcrc(&o,13);
-
-   stbiw__wp32(o, zlen);
-   stbiw__wptag(o, "IDAT");
-   STBIW_MEMMOVE(o, zlib, zlen);
-   o += zlen;
-   STBIW_FREE(zlib);
-   stbiw__wpcrc(&o, zlen);
-
-   stbiw__wp32(o,0);
-   stbiw__wptag(o, "IEND");
-   stbiw__wpcrc(&o,0);
-
-   STBIW_ASSERT(o == out + *out_len);
-
-   return out;
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
-{
-   FILE *f;
-   int len;
-   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (png == NULL) return 0;
-
-   f = stbiw__fopen(filename, "wb");
-   if (!f) { STBIW_FREE(png); return 0; }
-   fwrite(png, 1, len, f);
-   fclose(f);
-   STBIW_FREE(png);
-   return 1;
-}
-#endif
-
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
-{
-   int len;
-   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (png == NULL) return 0;
-   func(context, png, len);
-   STBIW_FREE(png);
-   return 1;
-}
-
-
-/* ***************************************************************************
- *
- * JPEG writer
- *
- * This is based on Jon Olick's jo_jpeg.cpp:
- * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
- */
-
-static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
-      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
-
-static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
-   int bitBuf = *bitBufP, bitCnt = *bitCntP;
-   bitCnt += bs[1];
-   bitBuf |= bs[0] << (24 - bitCnt);
-   while(bitCnt >= 8) {
-      unsigned char c = (bitBuf >> 16) & 255;
-      stbiw__putc(s, c);
-      if(c == 255) {
-         stbiw__putc(s, 0);
-      }
-      bitBuf <<= 8;
-      bitCnt -= 8;
-   }
-   *bitBufP = bitBuf;
-   *bitCntP = bitCnt;
-}
-
-static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
-   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
-   float z1, z2, z3, z4, z5, z11, z13;
-
-   float tmp0 = d0 + d7;
-   float tmp7 = d0 - d7;
-   float tmp1 = d1 + d6;
-   float tmp6 = d1 - d6;
-   float tmp2 = d2 + d5;
-   float tmp5 = d2 - d5;
-   float tmp3 = d3 + d4;
-   float tmp4 = d3 - d4;
-
-   // Even part
-   float tmp10 = tmp0 + tmp3;   // phase 2
-   float tmp13 = tmp0 - tmp3;
-   float tmp11 = tmp1 + tmp2;
-   float tmp12 = tmp1 - tmp2;
-
-   d0 = tmp10 + tmp11;       // phase 3
-   d4 = tmp10 - tmp11;
-
-   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
-   d2 = tmp13 + z1;       // phase 5
-   d6 = tmp13 - z1;
-
-   // Odd part
-   tmp10 = tmp4 + tmp5;       // phase 2
-   tmp11 = tmp5 + tmp6;
-   tmp12 = tmp6 + tmp7;
-
-   // The rotator is modified from fig 4-8 to avoid extra negations.
-   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
-   z2 = tmp10 * 0.541196100f + z5; // c2-c6
-   z4 = tmp12 * 1.306562965f + z5; // c2+c6
-   z3 = tmp11 * 0.707106781f; // c4
-
-   z11 = tmp7 + z3;      // phase 5
-   z13 = tmp7 - z3;
-
-   *d5p = z13 + z2;         // phase 6
-   *d3p = z13 - z2;
-   *d1p = z11 + z4;
-   *d7p = z11 - z4;
-
-   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;
-}
-
-static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
-   int tmp1 = val < 0 ? -val : val;
-   val = val < 0 ? val-1 : val;
-   bits[1] = 1;
-   while(tmp1 >>= 1) {
-      ++bits[1];
-   }
-   bits[0] = val & ((1<<bits[1])-1);
-}
-
-static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
-   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
-   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
-   int dataOff, i, j, n, diff, end0pos, x, y;
-   int DU[64];
-
-   // DCT rows
-   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
-   }
-   // DCT columns
-   for(dataOff=0; dataOff<8; ++dataOff) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
-                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
-   }
-   // Quantize/descale/zigzag the coefficients
-   for(y = 0, j=0; y < 8; ++y) {
-      for(x = 0; x < 8; ++x,++j) {
-         float v;
-         i = y*du_stride+x;
-         v = CDU[i]*fdtbl[j];
-         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
-         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
-      }
-   }
-
-   // Encode DC
-   diff = DU[0] - DC;
-   if (diff == 0) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
-   } else {
-      unsigned short bits[2];
-      stbiw__jpg_calcBits(diff, bits);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-   }
-   // Encode ACs
-   end0pos = 63;
-   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
-   }
-   // end0pos = first element in reverse order !=0
-   if(end0pos == 0) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-      return DU[0];
-   }
-   for(i = 1; i <= end0pos; ++i) {
-      int startpos = i;
-      int nrzeroes;
-      unsigned short bits[2];
-      for (; DU[i]==0 && i<=end0pos; ++i) {
-      }
-      nrzeroes = i-startpos;
-      if ( nrzeroes >= 16 ) {
-         int lng = nrzeroes>>4;
-         int nrmarker;
-         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
-            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
-         nrzeroes &= 15;
-      }
-      stbiw__jpg_calcBits(DU[i], bits);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-   }
-   if(end0pos != 63) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-   }
-   return DU[0];
-}
-
-static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
-   // Constants that don't pollute global namespace
-   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
-   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
-   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
-   static const unsigned char std_ac_luminance_values[] = {
-      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
-      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
-      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
-      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
-      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
-      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
-      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
-   };
-   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
-   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
-   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
-   static const unsigned char std_ac_chrominance_values[] = {
-      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
-      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
-      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
-      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
-      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
-      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
-      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
-   };
-   // Huffman tables
-   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
-   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
-   static const unsigned short YAC_HT[256][2] = {
-      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
-   };
-   static const unsigned short UVAC_HT[256][2] = {
-      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
-   };
-   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
-                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
-   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
-                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
-   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
-                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
-
-   int row, col, i, k, subsample;
-   float fdtbl_Y[64], fdtbl_UV[64];
-   unsigned char YTable[64], UVTable[64];
-
-   if(!data || !width || !height || comp > 4 || comp < 1) {
-      return 0;
-   }
-
-   quality = quality ? quality : 90;
-   subsample = quality <= 90 ? 1 : 0;
-   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
-   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
-
-   for(i = 0; i < 64; ++i) {
-      int uvti, yti = (YQT[i]*quality+50)/100;
-      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
-      uvti = (UVQT[i]*quality+50)/100;
-      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
-   }
-
-   for(row = 0, k = 0; row < 8; ++row) {
-      for(col = 0; col < 8; ++col, ++k) {
-         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-      }
-   }
-
-   // Write Headers
-   {
-      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
-      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
-      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
-                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
-      s->func(s->context, (void*)head0, sizeof(head0));
-      s->func(s->context, (void*)YTable, sizeof(YTable));
-      stbiw__putc(s, 1);
-      s->func(s->context, UVTable, sizeof(UVTable));
-      s->func(s->context, (void*)head1, sizeof(head1));
-      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
-      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
-      stbiw__putc(s, 0x10); // HTYACinfo
-      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
-      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
-      stbiw__putc(s, 1); // HTUDCinfo
-      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
-      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
-      stbiw__putc(s, 0x11); // HTUACinfo
-      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
-      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
-      s->func(s->context, (void*)head2, sizeof(head2));
-   }
-
-   // Encode 8x8 macroblocks
-   {
-      static const unsigned short fillBits[] = {0x7F, 7};
-      int DCY=0, DCU=0, DCV=0;
-      int bitBuf=0, bitCnt=0;
-      // comp == 2 is grey+alpha (alpha is ignored)
-      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
-      const unsigned char *dataR = (const unsigned char *)data;
-      const unsigned char *dataG = dataR + ofsG;
-      const unsigned char *dataB = dataR + ofsB;
-      int x, y, pos;
-      if(subsample) {
-         for(y = 0; y < height; y += 16) {
-            for(x = 0; x < width; x += 16) {
-               float Y[256], U[256], V[256];
-               for(row = y, pos = 0; row < y+16; ++row) {
-                  // row >= height => use last input row
-                  int clamped_row = (row < height) ? row : height - 1;
-                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
-                  for(col = x; col < x+16; ++col, ++pos) {
-                     // if col >= width => use pixel from last input column
-                     int p = base_p + ((col < width) ? col : (width-1))*comp;
-                     float r = dataR[p], g = dataG[p], b = dataB[p];
-                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
-                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
-                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
-                  }
-               }
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-
-               // subsample U,V
-               {
-                  float subU[64], subV[64];
-                  int yy, xx;
-                  for(yy = 0, pos = 0; yy < 8; ++yy) {
-                     for(xx = 0; xx < 8; ++xx, ++pos) {
-                        int j = yy*32+xx*2;
-                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
-                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
-                     }
-                  }
-                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
-                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
-               }
-            }
-         }
-      } else {
-         for(y = 0; y < height; y += 8) {
-            for(x = 0; x < width; x += 8) {
-               float Y[64], U[64], V[64];
-               for(row = y, pos = 0; row < y+8; ++row) {
-                  // row >= height => use last input row
-                  int clamped_row = (row < height) ? row : height - 1;
-                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
-                  for(col = x; col < x+8; ++col, ++pos) {
-                     // if col >= width => use pixel from last input column
-                     int p = base_p + ((col < width) ? col : (width-1))*comp;
-                     float r = dataR[p], g = dataG[p], b = dataB[p];
-                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
-                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
-                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
-                  }
-               }
-
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
-               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
-               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
-            }
-         }
-      }
-
-      // Do the bit alignment of the EOI marker
-      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
-   }
-
-   // EOI
-   stbiw__putc(s, 0xFF);
-   stbiw__putc(s, 0xD9);
-
-   return 1;
-}
-
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
-}
-
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif
-
-#endif // STB_IMAGE_WRITE_IMPLEMENTATION
-
-/* Revision history
-      1.16  (2021-07-11)
-             make Deflate code emit uncompressed blocks when it would otherwise expand
-             support writing BMPs with alpha channel
-      1.15  (2020-07-13) unknown
-      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
-      1.13
-      1.12
-      1.11  (2019-08-11)
-
-      1.10  (2019-02-07)
-             support utf8 filenames in Windows; fix warnings and platform ifdefs
-      1.09  (2018-02-11)
-             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
-      1.08  (2018-01-29)
-             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
-      1.07  (2017-07-24)
-             doc fix
-      1.06 (2017-07-23)
-             writing JPEG (using Jon Olick's code)
-      1.05   ???
-      1.04 (2017-03-03)
-             monochrome BMP expansion
-      1.03   ???
-      1.02 (2016-04-02)
-             avoid allocating large structures on the stack
-      1.01 (2016-01-16)
-             STBIW_REALLOC_SIZED: support allocators with no realloc support
-             avoid race-condition in crc initialization
-             minor compile issues
-      1.00 (2015-09-14)
-             installable file IO function
-      0.99 (2015-09-13)
-             warning fixes; TGA rle support
-      0.98 (2015-04-08)
-             added STBIW_MALLOC, STBIW_ASSERT etc
-      0.97 (2015-01-18)
-             fixed HDR asserts, rewrote HDR rle logic
-      0.96 (2015-01-17)
-             add HDR output
-             fix monochrome BMP
-      0.95 (2014-08-17)
-             add monochrome TGA output
-      0.94 (2014-05-31)
-             rename private functions to avoid conflicts with stb_image.h
-      0.93 (2014-05-27)
-             warning fixes
-      0.92 (2010-08-01)
-             casts to unsigned char to fix warnings
-      0.91 (2010-07-17)
-             first public release
-      0.90   first internal release
-*/
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/

+ 0 - 22
ggml/examples/whisper/CMakeLists.txt

@@ -1,22 +0,0 @@
-#
-# whisper
-
-add_library(whisper-cpp
-    whisper.cpp
-    )
-
-target_link_libraries(whisper-cpp PRIVATE
-    ggml
-    )
-
-set(TEST_TARGET whisper)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE whisper-cpp common)
-target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
-
-#
-# whisper-quantize
-
-set(TEST_TARGET whisper-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

+ 0 - 29
ggml/examples/whisper/README.md

@@ -1,29 +0,0 @@
-# whisper
-
-Port of [OpenAI's Whisper](https://github.com/openai/whisper) ASR model in C/C++ using
-[ggml](https://github.com/ggerganov/ggml)
-
-## More info
-
-Checkout https://github.com/ggerganov/whisper.cpp
-
-## Memory usage
-
-| Model  | Disk   | Mem     |
-| ---    | ---    | ---     |
-| tiny   |  75 MB | ~280 MB |
-| base   | 142 MB | ~430 MB |
-| small  | 466 MB | ~1.0 GB |
-| medium | 1.5 GB | ~2.6 GB |
-| large  | 2.9 GB | ~4.7 GB |
-
-## ggml format
-
-The original models are converted to a custom binary format. This allows to pack everything needed into a single file:
-
-- model parameters
-- mel filters
-- vocabulary
-- weights
-
-For more details, see the conversion script [convert-pt-to-ggml.py](convert-pt-to-ggml.py)

+ 0 - 342
ggml/examples/whisper/convert-pt-to-ggml.py

@@ -1,342 +0,0 @@
-# Convert Whisper transformer model from PyTorch to ggml format
-#
-# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-#
-# You need to clone the original repo in ~/path/to/repo/whisper/
-#
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-#
-# It is used to various assets needed by the algorithm:
-#
-#  - tokenizer
-#  - mel filters
-#
-# Also, you need to have the original models in ~/.cache/whisper/
-# See the original repo for more details.
-#
-# This script loads the specified model and whisper assets and saves them in ggml format.
-# The output is a single binary file containing the following information:
-#
-#  - hparams
-#  - mel filters
-#  - tokenizer vocab
-#  - model variables
-#
-# For each variable, write the following:
-#
-#  - Number of dimensions (int)
-#  - Name length (int)
-#  - Dimensions (int[n_dims])
-#  - Name (char[name_length])
-#  - Data (float[n_dims])
-#
-
-import io
-import os
-import sys
-import struct
-import json
-import code
-import torch
-import numpy as np
-import base64
-from pathlib import Path
-#from transformers import GPTJForCausalLM
-#from transformers import GPT2TokenizerFast
-
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-#LANGUAGES = {
-#    "en": "english",
-#    "zh": "chinese",
-#    "de": "german",
-#    "es": "spanish",
-#    "ru": "russian",
-#    "ko": "korean",
-#    "fr": "french",
-#    "ja": "japanese",
-#    "pt": "portuguese",
-#    "tr": "turkish",
-#    "pl": "polish",
-#    "ca": "catalan",
-#    "nl": "dutch",
-#    "ar": "arabic",
-#    "sv": "swedish",
-#    "it": "italian",
-#    "id": "indonesian",
-#    "hi": "hindi",
-#    "fi": "finnish",
-#    "vi": "vietnamese",
-#    "iw": "hebrew",
-#    "uk": "ukrainian",
-#    "el": "greek",
-#    "ms": "malay",
-#    "cs": "czech",
-#    "ro": "romanian",
-#    "da": "danish",
-#    "hu": "hungarian",
-#    "ta": "tamil",
-#    "no": "norwegian",
-#    "th": "thai",
-#    "ur": "urdu",
-#    "hr": "croatian",
-#    "bg": "bulgarian",
-#    "lt": "lithuanian",
-#    "la": "latin",
-#    "mi": "maori",
-#    "ml": "malayalam",
-#    "cy": "welsh",
-#    "sk": "slovak",
-#    "te": "telugu",
-#    "fa": "persian",
-#    "lv": "latvian",
-#    "bn": "bengali",
-#    "sr": "serbian",
-#    "az": "azerbaijani",
-#    "sl": "slovenian",
-#    "kn": "kannada",
-#    "et": "estonian",
-#    "mk": "macedonian",
-#    "br": "breton",
-#    "eu": "basque",
-#    "is": "icelandic",
-#    "hy": "armenian",
-#    "ne": "nepali",
-#    "mn": "mongolian",
-#    "bs": "bosnian",
-#    "kk": "kazakh",
-#    "sq": "albanian",
-#    "sw": "swahili",
-#    "gl": "galician",
-#    "mr": "marathi",
-#    "pa": "punjabi",
-#    "si": "sinhala",
-#    "km": "khmer",
-#    "sn": "shona",
-#    "yo": "yoruba",
-#    "so": "somali",
-#    "af": "afrikaans",
-#    "oc": "occitan",
-#    "ka": "georgian",
-#    "be": "belarusian",
-#    "tg": "tajik",
-#    "sd": "sindhi",
-#    "gu": "gujarati",
-#    "am": "amharic",
-#    "yi": "yiddish",
-#    "lo": "lao",
-#    "uz": "uzbek",
-#    "fo": "faroese",
-#    "ht": "haitian creole",
-#    "ps": "pashto",
-#    "tk": "turkmen",
-#    "nn": "nynorsk",
-#    "mt": "maltese",
-#    "sa": "sanskrit",
-#    "lb": "luxembourgish",
-#    "my": "myanmar",
-#    "bo": "tibetan",
-#    "tl": "tagalog",
-#    "mg": "malagasy",
-#    "as": "assamese",
-#    "tt": "tatar",
-#    "haw": "hawaiian",
-#    "ln": "lingala",
-#    "ha": "hausa",
-#    "ba": "bashkir",
-#    "jw": "javanese",
-#    "su": "sundanese",
-#}
-
-## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-#    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-#    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-#    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-#
-#    specials = [
-#        "<|startoftranscript|>",
-#        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-#        "<|translate|>",
-#        "<|transcribe|>",
-#        "<|startoflm|>",
-#        "<|startofprev|>",
-#        "<|nocaptions|>",
-#        "<|notimestamps|>",
-#    ]
-#
-#    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-#    return tokenizer
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-if len(sys.argv) < 4:
-    print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
-    sys.exit(1)
-
-fname_inp   = Path(sys.argv[1])
-dir_whisper = Path(sys.argv[2])
-dir_out     = Path(sys.argv[3])
-
-# try to load PyTorch binary data
-try:
-    model_bytes = open(fname_inp, "rb").read()
-    with io.BytesIO(model_bytes) as fp:
-        checkpoint = torch.load(fp, map_location="cpu")
-except Exception:
-    print("Error: failed to load PyTorch model file:" , fname_inp)
-    sys.exit(1)
-
-hparams = checkpoint["dims"]
-print("hparams:", hparams)
-
-list_vars = checkpoint["model_state_dict"]
-
-#print(list_vars['encoder.positional_embedding'])
-#print(list_vars['encoder.conv1.weight'])
-#print(list_vars['encoder.conv1.weight'].shape)
-
-# load mel filters
-n_mels = hparams["n_mels"]
-with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
-    filters = torch.from_numpy(f[f"mel_{n_mels}"])
-    #print (filters)
-
-#code.interact(local=locals())
-
-# load tokenizer
-# for backwards compatibility, also check for older hf_transformers format tokenizer files
-# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
-# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
-multilingual = hparams["n_vocab"] == 51865
-tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
-tokenizer_type = "tiktoken"
-if not tokenizer.is_file():
-    tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual" or "gpt2") / "vocab.json"
-    tokenizer_type = "hf_transformers"
-    if not tokenizer.is_file():
-        print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer)
-        sys.exit(1)
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-if tokenizer_type == "tiktoken":
-    with open(tokenizer, "rb") as f:
-        contents = f.read()
-        tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
-elif tokenizer_type == "hf_transformers":
-    with open(tokenizer, "r", encoding="utf8") as f:
-        _tokens_raw = json.load(f)
-        if '<|endoftext|>' in _tokens_raw:
-            # ensures exact same model as tokenizer_type == tiktoken
-            # details: https://github.com/ggerganov/whisper.cpp/pull/725
-            del _tokens_raw['<|endoftext|>']
-        tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()}
-
-# output in the same directory as the model
-fname_out = dir_out / "ggml-model.bin"
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 4:
-    use_f16 = False
-    fname_out = dir_out / "ggml-model-f32.bin"
-
-fout = fname_out.open("wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_vocab"]))
-fout.write(struct.pack("i", hparams["n_audio_ctx"]))
-fout.write(struct.pack("i", hparams["n_audio_state"]))
-fout.write(struct.pack("i", hparams["n_audio_head"]))
-fout.write(struct.pack("i", hparams["n_audio_layer"]))
-fout.write(struct.pack("i", hparams["n_text_ctx"]))
-fout.write(struct.pack("i", hparams["n_text_state"]))
-fout.write(struct.pack("i", hparams["n_text_head"]))
-fout.write(struct.pack("i", hparams["n_text_layer"]))
-fout.write(struct.pack("i", hparams["n_mels"]))
-fout.write(struct.pack("i", use_f16))
-
-# write mel filters
-fout.write(struct.pack("i", filters.shape[0]))
-fout.write(struct.pack("i", filters.shape[1]))
-for i in range(filters.shape[0]):
-    for j in range(filters.shape[1]):
-        fout.write(struct.pack("f", filters[i][j]))
-
-# write tokenizer
-fout.write(struct.pack("i", len(tokens)))
-
-for key in tokens:
-    fout.write(struct.pack("i", len(key)))
-    fout.write(key)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " , name ,  " with shape: ", data.shape)
-
-    # reshape conv bias from [n] to [n, 1]
-    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
-        data = data.reshape(data.shape[0], 1)
-        print(f"  Reshaped variable: {name} to shape: ", data.shape)
-
-    n_dims = len(data.shape)
-
-    # looks like the whisper models are in f16 by default
-    # so we need to convert the small tensors to f32 until we fully support f16 in ggml
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 1
-    if use_f16:
-        if n_dims < 2 or \
-                name == "encoder.conv1.bias"   or \
-                name == "encoder.conv2.bias"   or \
-                name == "encoder.positional_embedding" or \
-                name == "decoder.positional_embedding":
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-    else:
-        data = data.astype(np.float32)
-        ftype = 0
-
-    #if name.startswith("encoder"):
-    #    if name.endswith("mlp.0.weight") or \
-    #       name.endswith("mlp.2.weight"):
-    #        print("  Transposing")
-    #        data = data.transpose()
-
-    # header
-    str_ = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str_)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " , fname_out)
-print("")

+ 0 - 1024
ggml/examples/whisper/main.cpp

@@ -1,1024 +0,0 @@
-#include "common.h"
-
-#include "whisper.h"
-
-#include <cmath>
-#include <fstream>
-#include <cstdio>
-#include <string>
-#include <thread>
-#include <vector>
-#include <cstring>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
-const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
-};
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-int timestamp_to_sample(int64_t t, int n_samples) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
-}
-
-// helper function to replace substrings
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    for (size_t pos = 0; ; pos += replace.length()) {
-        pos = s.find(search, pos);
-        if (pos == std::string::npos) break;
-        s.erase(pos, search.length());
-        s.insert(pos, replace);
-    }
-}
-
-// command-line parameters
-struct whisper_params {
-    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors =  1;
-    int32_t offset_t_ms  =  0;
-    int32_t offset_n     =  0;
-    int32_t duration_ms  =  0;
-    int32_t progress_step =  5;
-    int32_t max_context  = -1;
-    int32_t max_len      =  0;
-    int32_t best_of      =  2;
-    int32_t beam_size    = -1;
-
-    float word_thold    =  0.01f;
-    float entropy_thold =  2.40f;
-    float logprob_thold = -1.00f;
-
-    bool speed_up        = false;
-    bool debug_mode      = false;
-    bool translate       = false;
-    bool detect_language = false;
-    bool diarize         = false;
-    bool tinydiarize     = false;
-    bool split_on_word   = false;
-    bool no_fallback     = false;
-    bool output_txt      = false;
-    bool output_vtt      = false;
-    bool output_srt      = false;
-    bool output_wts      = false;
-    bool output_csv      = false;
-    bool output_jsn      = false;
-    bool output_lrc      = false;
-    bool print_special   = false;
-    bool print_colors    = false;
-    bool print_progress  = false;
-    bool no_timestamps   = false;
-    bool log_score       = false;
-
-    std::string language  = "en";
-    std::string prompt;
-    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-    std::string model     = "models/ggml-base.en.bin";
-
-    // [TDRZ] speaker turn string
-    std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
-
-    std::string openvino_encode_device = "CPU";
-
-    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-"){
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
-        if (arg[0] != '-') {
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"    || arg == "--threads")         { params.n_threads       = std::stoi(argv[++i]); }
-        else if (arg == "-p"    || arg == "--processors")      { params.n_processors    = std::stoi(argv[++i]); }
-        else if (arg == "-ot"   || arg == "--offset-t")        { params.offset_t_ms     = std::stoi(argv[++i]); }
-        else if (arg == "-on"   || arg == "--offset-n")        { params.offset_n        = std::stoi(argv[++i]); }
-        else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
-        else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
-        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
-        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
-        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
-        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
-        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
-        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
-        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
-        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
-        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
-        else if (arg == "-tdrz" || arg == "--tinydiarize")     { params.tinydiarize     = true; }
-        else if (arg == "-sow"  || arg == "--split-on-word")   { params.split_on_word   = true; }
-        else if (arg == "-nf"   || arg == "--no-fallback")     { params.no_fallback     = true; }
-        else if (arg == "-otxt" || arg == "--output-txt")      { params.output_txt      = true; }
-        else if (arg == "-ovtt" || arg == "--output-vtt")      { params.output_vtt      = true; }
-        else if (arg == "-osrt" || arg == "--output-srt")      { params.output_srt      = true; }
-        else if (arg == "-owts" || arg == "--output-words")    { params.output_wts      = true; }
-        else if (arg == "-olrc" || arg == "--output-lrc")      { params.output_lrc      = true; }
-        else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = argv[++i]; }
-        else if (arg == "-ocsv" || arg == "--output-csv")      { params.output_csv      = true; }
-        else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
-        else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(argv[++i]); }
-        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
-        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
-        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
-        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
-        else if (arg == "-l"    || arg == "--language")        { params.language        = argv[++i]; }
-        else if (arg == "-dl"   || arg == "--detect-language") { params.detect_language = true; }
-        else if (                  arg == "--prompt")          { params.prompt          = argv[++i]; }
-        else if (arg == "-m"    || arg == "--model")           { params.model           = argv[++i]; }
-        else if (arg == "-f"    || arg == "--file")            { params.fname_inp.emplace_back(argv[++i]); }
-        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = argv[++i]; }
-        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score = true; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
-    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -tdrz,     --tinydiarize       [%-7s] enable tinydiarize (requires a tdrz model)\n",     params.tinydiarize ? "true" : "false");
-    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -olrc,     --output-lrc        [%-7s] output result in a lrc file\n",                    params.output_lrc ? "true" : "false");
-    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
-    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
-    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
-    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
-    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
-    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
-    fprintf(stderr, "  -oved D,   --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
-    fprintf(stderr, "  -ls,       --log-score         [%-7s] log best decoder scores of tokens\n",              params.log_score?"true":"false");
-    fprintf(stderr, "\n");
-}
-
-struct whisper_print_user_data {
-    const whisper_params * params;
-
-    const std::vector<std::vector<float>> * pcmf32s;
-    int progress_prev;
-};
-
-std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
-    std::string speaker = "";
-    const int64_t n_samples = pcmf32s[0].size();
-
-    const int64_t is0 = timestamp_to_sample(t0, n_samples);
-    const int64_t is1 = timestamp_to_sample(t1, n_samples);
-
-    double energy0 = 0.0f;
-    double energy1 = 0.0f;
-
-    for (int64_t j = is0; j < is1; j++) {
-        energy0 += fabs(pcmf32s[0][j]);
-        energy1 += fabs(pcmf32s[1][j]);
-    }
-
-    if (energy0 > 1.1*energy1) {
-        speaker = "0";
-    } else if (energy1 > 1.1*energy0) {
-        speaker = "1";
-    } else {
-        speaker = "?";
-    }
-
-    //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
-
-    if (!id_only) {
-        speaker.insert(0, "(speaker ");
-        speaker.append(")");
-    }
-
-    return speaker;
-}
-void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
-    int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
-    int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
-    if (progress >= *progress_prev + progress_step) {
-        *progress_prev += progress_step;
-        fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
-    }
-}
-
-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
-    const auto & params  = *((whisper_print_user_data *) user_data)->params;
-    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-
-    std::string speaker = "";
-
-    int64_t t0 = 0;
-    int64_t t1 = 0;
-
-    // print the last n_new segments
-    const int s0 = n_segments - n_new;
-
-    if (s0 == 0) {
-        printf("\n");
-    }
-
-    for (int i = s0; i < n_segments; i++) {
-        if (!params.no_timestamps || params.diarize) {
-            t0 = whisper_full_get_segment_t0(ctx, i);
-            t1 = whisper_full_get_segment_t1(ctx, i);
-        }
-
-        if (!params.no_timestamps) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-        }
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        if (params.print_colors) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-                }
-
-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
-            }
-        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);
-
-            printf("%s%s", speaker.c_str(), text);
-        }
-
-        if (params.tinydiarize) {
-            if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
-                printf("%s", params.tdrz_speaker_turn.c_str());
-            }
-        }
-
-        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
-            printf("\n");
-        }
-
-        fflush(stdout);
-    }
-}
-
-bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        fout << speaker << text << "\n";
-    }
-
-    return true;
-}
-
-bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    fout << "WEBVTT\n\n";
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
-            speaker.insert(0, "<v Speaker");
-            speaker.append(">");
-        }
-
-        fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
-        fout << speaker << text << "\n\n";
-    }
-
-    return true;
-}
-
-bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        fout << i + 1 + params.offset_n << "\n";
-        fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
-        fout << speaker << text << "\n\n";
-    }
-
-    return true;
-}
-
-char *escape_double_quotes_and_backslashes(const char *str) {
-    if (str == NULL) {
-        return NULL;
-    }
-
-    size_t escaped_length = strlen(str) + 1;
-
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"' || str[i] == '\\') {
-            escaped_length++;
-        }
-    }
-
-    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
-    if (escaped == NULL) {
-        return NULL;
-    }
-
-    size_t pos = 0;
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"' || str[i] == '\\') {
-            escaped[pos++] = '\\';
-        }
-        escaped[pos++] = str[i];
-    }
-
-    // no need to set zero due to calloc() being used prior
-
-    return escaped;
-}
-
-bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,";
-    if (params.diarize && pcmf32s.size() == 2)
-    {
-        fout << "speaker,";
-    }
-    fout << "text\n";
-
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-        char * text_escaped = escape_double_quotes_and_backslashes(text);
-
-        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",";
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            fout << estimate_diarization_speaker(pcmf32s, t0, t1, true) << ",";
-        }
-        fout << "\"" << text_escaped << "\"\n";
-    }
-
-    return true;
-}
-
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
-    std::ofstream fout(fname);
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    // fprintf(stderr,"segments: %d\n",n_segments);
-    for (int i = 0; i < n_segments; ++i) {
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        // fprintf(stderr,"tokens: %d\n",n_tokens);
-        for (int j = 0; j < n_tokens; j++) {
-            auto token = whisper_full_get_token_text(ctx, i, j);
-            auto probability = whisper_full_get_token_p(ctx, i, j);
-            fout << token << '\t' << probability << std::endl;
-            // fprintf(stderr,"token: %s %f\n",token,probability);
-	    }
-    }
-    return true;
-}
-
-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "},\n");
-    };
-
-    auto start_obj = [&](const char *name) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end) {
-        start_value(name);
-        char * val_escaped = escape_double_quotes_and_backslashes(val);
-        fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
-        free(val_escaped);
-    };
-
-    auto end_value = [&](bool end) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj(nullptr);
-        value_s("systeminfo", whisper_print_system_info(), false);
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx), false);
-            value_b("multilingual", whisper_is_multilingual(ctx), false);
-            value_i("vocab", whisper_model_n_vocab(ctx), false);
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx), false);
-                value_i("state", whisper_model_n_audio_state(ctx), false);
-                value_i("head", whisper_model_n_audio_head(ctx), false);
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj(false);
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx), false);
-                value_i("state", whisper_model_n_text_state(ctx), false);
-                value_i("head", whisper_model_n_text_head(ctx), false);
-                value_i("layer", whisper_model_n_text_layer(ctx), true);
-            end_obj(false);
-            value_i("mels", whisper_model_n_mels(ctx), false);
-            value_i("ftype", whisper_model_ftype(ctx), true);
-        end_obj(false);
-        start_obj("params");
-            value_s("model", params.model.c_str(), false);
-            value_s("language", params.language.c_str(), false);
-            value_b("translate", params.translate, true);
-        end_obj(false);
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj(false);
-        start_arr("transcription");
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                start_obj(nullptr);
-                    start_obj("timestamps");
-                        value_s("from", to_timestamp(t0, true).c_str(), false);
-                        value_s("to", to_timestamp(t1, true).c_str(), true);
-                    end_obj(false);
-                    start_obj("offsets");
-                        value_i("from", t0 * 10, false);
-                        value_i("to", t1 * 10, true);
-                    end_obj(false);
-                    value_s("text", text, !params.diarize && !params.tinydiarize);
-
-                    if (params.diarize && pcmf32s.size() == 2) {
-                        value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
-                    }
-
-                    if (params.tinydiarize) {
-                        value_b("speaker_turn_next", whisper_full_get_segment_speaker_turn_next(ctx, i), true);
-                    }
-                end_obj(i == (n_segments - 1));
-            }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
-
-// karaoke video generation
-// outputs a bash script that uses ffmpeg to generate a video with the subtitles
-// TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    static const char * font = params.font_path.c_str();
-
-    std::ifstream fin(font);
-    if (!fin.is_open()) {
-        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
-        return false;
-    }
-
-    fout << "#!/bin/bash" << "\n";
-    fout << "\n";
-
-    fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";
-
-    for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-        const int n = whisper_full_n_tokens(ctx, i);
-
-        std::vector<whisper_token_data> tokens(n);
-        for (int j = 0; j < n; ++j) {
-            tokens[j] = whisper_full_get_token_data(ctx, i, j);
-        }
-
-        if (i > 0) {
-            fout << ",";
-        }
-
-        // background text
-        fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
-
-        bool is_first = true;
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        for (int j = 0; j < n; ++j) {
-            const auto & token = tokens[j];
-
-            if (tokens[j].id >= whisper_token_eot(ctx)) {
-                continue;
-            }
-
-            std::string txt_bg = "";
-            std::string txt_fg = ""; // highlight token
-            std::string txt_ul = ""; // underline
-
-            if (params.diarize && pcmf32s.size() == 2) {
-                txt_bg = speaker;
-                txt_fg = speaker;
-                txt_ul = "\\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ ";
-            }
-
-            txt_bg.append("> ");
-            txt_fg.append("> ");
-            txt_ul.append("\\ \\ ");
-
-            {
-                for (int k = 0; k < n; ++k) {
-                    const auto & token2 = tokens[k];
-
-                    if (tokens[k].id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-
-                    const std::string txt = whisper_token_to_str(ctx, token2.id);
-
-                    txt_bg += txt;
-
-                    if (k == j) {
-                        for (int l = 0; l < (int) txt.size(); ++l) {
-                            txt_fg += txt[l];
-                            txt_ul += "_";
-                        }
-                        txt_fg += "|";
-                    } else {
-                        for (int l = 0; l < (int) txt.size(); ++l) {
-                            txt_fg += "\\ ";
-                            txt_ul += "\\ ";
-                        }
-                    }
-                }
-
-                ::replace_all(txt_bg, "'", "\u2019");
-                ::replace_all(txt_bg, "\"", "\\\"");
-                ::replace_all(txt_fg, "'", "\u2019");
-                ::replace_all(txt_fg, "\"", "\\\"");
-            }
-
-            if (is_first) {
-                // background text
-                fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
-                is_first = false;
-            }
-
-            // foreground text
-            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
-
-            // underline
-            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
-        }
-    }
-
-    fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";
-
-    fout << "\n\n";
-    fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
-    fout << "\n";
-    fout << "echo \"  ffplay " << fname_inp << ".mp4\"\n";
-    fout << "\n";
-
-    fout.close();
-
-    fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
-
-    return true;
-}
-
-bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    fout << "[by:whisper.cpp]\n";
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t = whisper_full_get_segment_t0(ctx, i);
-
-        int64_t msec = t * 10;
-        int64_t min = msec / (1000 * 60);
-        msec = msec - min * (1000 * 60);
-        int64_t sec = msec / 1000;
-        msec = msec - sec * 1000;
-
-        char buf[16];
-        snprintf(buf, sizeof(buf), "%02d:%02d.%02d", (int) min, (int) sec, (int) ( msec / 10));
-        std::string timestamp_lrc = std::string(buf);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        fout <<  '[' << timestamp_lrc << ']' << speaker << text << "\n";
-    }
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        whisper_print_usage(argc, argv, params);
-        return 1;
-    }
-
-    if (params.fname_inp.empty()) {
-        fprintf(stderr, "error: no input files specified\n");
-        whisper_print_usage(argc, argv, params);
-        return 2;
-    }
-
-    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    if (params.diarize && params.tinydiarize) {
-        fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
-
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 3;
-    }
-
-    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
-    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
-
-    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
-        const auto fname_inp = params.fname_inp[f];
-		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
-
-        std::vector<float> pcmf32;               // mono-channel F32 PCM
-        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
-        }
-
-        // print system information
-        {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-        }
-
-        // print some info about the processing
-        {
-            fprintf(stderr, "\n");
-            if (!whisper_is_multilingual(ctx)) {
-                if (params.language != "en" || params.translate) {
-                    params.language = "en";
-                    params.translate = false;
-                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-                }
-            }
-            if (params.detect_language) {
-                params.language = "auto";
-            }
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
-                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors,
-                    params.language.c_str(),
-                    params.translate ? "translate" : "transcribe",
-                    params.tinydiarize ? "tdrz = 1, " : "",
-                    params.no_timestamps ? 0 : 1);
-
-            fprintf(stderr, "\n");
-        }
-
-        // run the inference
-        {
-            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-            wparams.print_realtime   = false;
-            wparams.print_progress   = params.print_progress;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special    = params.print_special;
-            wparams.translate        = params.translate;
-            wparams.language         = params.language.c_str();
-            wparams.detect_language  = params.detect_language;
-            wparams.n_threads        = params.n_threads;
-            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-            wparams.offset_ms        = params.offset_t_ms;
-            wparams.duration_ms      = params.duration_ms;
-
-            wparams.token_timestamps = params.output_wts || params.max_len > 0;
-            wparams.thold_pt         = params.word_thold;
-            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-            wparams.split_on_word    = params.split_on_word;
-
-            wparams.speed_up         = params.speed_up;
-            wparams.debug_mode       = params.debug_mode;
-
-            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
-
-            wparams.initial_prompt   = params.prompt.c_str();
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
-
-            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
-
-            // this callback is called on each new segment
-            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &user_data;
-            }
-
-            if (wparams.print_progress) {
-                wparams.progress_callback           = whisper_print_progress_callback;
-                wparams.progress_callback_user_data = &user_data;
-            }
-
-            // example for abort mechanism
-            // in this example, we do not abort the processing, but we could if the flag is set to true
-            // the callback is called before every encoder run - if it returns false, the processing is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return !is_aborted;
-                };
-                wparams.encoder_begin_callback_user_data = &is_aborted;
-            }
-
-            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                return 10;
-            }
-        }
-
-        // output stuff
-        {
-            printf("\n");
-
-            // output to text file
-            if (params.output_txt) {
-                const auto fname_txt = fname_out + ".txt";
-                output_txt(ctx, fname_txt.c_str(), params, pcmf32s);
-            }
-
-            // output to VTT file
-            if (params.output_vtt) {
-                const auto fname_vtt = fname_out + ".vtt";
-                output_vtt(ctx, fname_vtt.c_str(), params, pcmf32s);
-            }
-
-            // output to SRT file
-            if (params.output_srt) {
-                const auto fname_srt = fname_out + ".srt";
-                output_srt(ctx, fname_srt.c_str(), params, pcmf32s);
-            }
-
-            // output to WTS file
-            if (params.output_wts) {
-                const auto fname_wts = fname_out + ".wts";
-                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, pcmf32s);
-            }
-
-            // output to CSV file
-            if (params.output_csv) {
-                const auto fname_csv = fname_out + ".csv";
-                output_csv(ctx, fname_csv.c_str(), params, pcmf32s);
-            }
-
-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params, pcmf32s);
-            }
-
-            // output to LRC file
-            if (params.output_lrc) {
-                const auto fname_lrc = fname_out + ".lrc";
-                output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
-            }
-
-            // output to score file
-            if (params.log_score) {
-                const auto fname_score = fname_out + ".score.txt";
-                output_score(ctx, fname_score.c_str(), params, pcmf32s);
-            }
-        }
-    }
-
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
-
-    return 0;
-}

+ 0 - 223
ggml/examples/whisper/quantize.cpp

@@ -1,223 +0,0 @@
-#include "ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (Whisper tiny)
-struct whisper_hparams {
-    int32_t n_vocab       = 51864;
-    int32_t n_audio_ctx   = 1500;
-    int32_t n_audio_state = 384;
-    int32_t n_audio_head  = 6;
-    int32_t n_audio_layer = 4;
-    int32_t n_text_ctx    = 448;
-    int32_t n_text_state  = 384;
-    int32_t n_text_head   = 6;
-    int32_t n_text_layer  = 4;
-    int32_t n_mels        = 80;
-    int32_t ftype         = 1;
-};
-
-struct whisper_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-// quantize a model
-bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    whisper_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
-        finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
-        finp.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
-        finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
-        finp.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
-        finp.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
-        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
-        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
-        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
-        finp.read((char *) &hparams.ftype,         sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
-        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
-        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
-        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
-        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
-        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
-        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: ftype (src)   = %d\n", __func__, hparams.ftype);
-        fprintf(stderr, "%s: qntvr (src)   = %d\n", __func__, qntvr_src);
-        fprintf(stderr, "%s: ftype (dst)   = %d\n", __func__, ftype_dst);
-        fprintf(stderr, "%s: qntvr (dst)   = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((const char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
-        fout.write((const char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
-        fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
-        fout.write((const char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
-        fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
-        fout.write((const char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
-        fout.write((const char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
-        fout.write((const char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
-        fout.write((const char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
-        fout.write((const char *) &hparams.n_mels,        sizeof(hparams.n_mels));
-        fout.write((const char *) &ftype_dst,             sizeof(hparams.ftype));
-    }
-
-    // load mel filters
-    {
-        whisper_filters filters;
-
-        finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
-        fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
-        finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
-        fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
-
-        filters.data.resize(filters.n_mel * filters.n_fft);
-        finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
-        fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        //if (n_vocab != hparams.n_vocab) {
-        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-        //            __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-        //    return false;
-        //}
-
-        char word[129];
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word[len] = '\0';
-
-            finp.read ((char *) word, len);
-            fout.write((char *) word, len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to not be quantized
-    const std::vector<std::string> to_skip = {
-        //"encoder.*",
-        "encoder.conv1.bias",
-        "encoder.conv2.bias",
-        "encoder.positional_embedding",
-        "decoder.positional_embedding",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}

+ 0 - 5515
ggml/examples/whisper/whisper.cpp

@@ -1,5515 +0,0 @@
-#include "whisper.h"
-#ifdef WHISPER_USE_COREML
-#include "coreml/whisper-encoder.h"
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-#include "openvino/whisper-openvino-encoder.h"
-#endif
-
-#include "ggml.h"
-
-#include <algorithm>
-#include <cassert>
-#define _USE_MATH_DEFINES
-#include <cmath>
-#include <cstdio>
-#include <cstdarg>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-#include <random>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(GGML_BIG_ENDIAN)
-#include <bit>
-
-template<typename T>
-static T byteswap(T value) {
-    return std::byteswap(value);
-}
-
-template<>
-float byteswap(float value) {
-    return std::bit_cast<float>(byteswap(std::bit_cast<std::uint32_t>(value)));
-}
-
-template<typename T>
-static void byteswap_tensor_data(ggml_tensor * tensor) {
-    T * datum = reinterpret_cast<T *>(tensor->data);
-    for (int i = 0; i < ggml_nelements(tensor); i++) {
-        datum[i] = byteswap(datum[i]);
-    }
-}
-
-static void byteswap_tensor(ggml_tensor * tensor) {
-    switch (tensor->type) {
-        case GGML_TYPE_I16: {
-            byteswap_tensor_data<int16_t>(tensor);
-            break;
-        }
-        case GGML_TYPE_F16: {
-            byteswap_tensor_data<ggml_fp16_t>(tensor);
-            break;
-        }
-        case GGML_TYPE_I32: {
-            byteswap_tensor_data<int32_t>(tensor);
-            break;
-        }
-        case GGML_TYPE_F32: {
-            byteswap_tensor_data<float>(tensor);
-            break;
-        }
-        default: { // GML_TYPE_I8
-            break;
-        }
-    }
-}
-
-#define BYTESWAP_VALUE(d) d = byteswap(d)
-#define BYTESWAP_FILTERS(f)            \
-    do {                              \
-        for (auto & datum : f.data) { \
-            datum = byteswap(datum);  \
-        }                             \
-    } while (0)
-#define BYTESWAP_TENSOR(t)       \
-    do {                         \
-        byteswap_tensor(t); \
-    } while (0)
-#else
-#define BYTESWAP_VALUE(d) do {} while (0)
-#define BYTESWAP_FILTERS(f) do {} while (0)
-#define BYTESWAP_TENSOR(t) do {} while (0)
-#endif
-
-#define WHISPER_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-// define this to enable verbose trace logging - useful for debugging purposes
-//#define WHISPER_DEBUG
-
-#if defined(WHISPER_DEBUG)
-#define WHISPER_PRINT_DEBUG(...) \
-    do { \
-        fprintf(stderr, __VA_ARGS__); \
-    } while (0)
-#else
-#define WHISPER_PRINT_DEBUG(...)
-#endif
-
-//#define WHISPER_USE_FLASH_ATTN
-//#define WHISPER_USE_FLASH_FF
-#define WHISPER_MAX_DECODERS 16
-
-#define WHISPER_USE_SCRATCH
-#define WHISPER_MAX_SCRATCH_BUFFERS 16
-
-// available whisper models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_TINY,
-    MODEL_BASE,
-    MODEL_SMALL,
-    MODEL_MEDIUM,
-    MODEL_LARGE,
-};
-
-static const std::map<std::string, std::pair<int, std::string>> g_lang = {
-    { "en",  { 0,  "english",         } },
-    { "zh",  { 1,  "chinese",         } },
-    { "de",  { 2,  "german",          } },
-    { "es",  { 3,  "spanish",         } },
-    { "ru",  { 4,  "russian",         } },
-    { "ko",  { 5,  "korean",          } },
-    { "fr",  { 6,  "french",          } },
-    { "ja",  { 7,  "japanese",        } },
-    { "pt",  { 8,  "portuguese",      } },
-    { "tr",  { 9,  "turkish",         } },
-    { "pl",  { 10, "polish",          } },
-    { "ca",  { 11,  "catalan",        } },
-    { "nl",  { 12,  "dutch",          } },
-    { "ar",  { 13,  "arabic",         } },
-    { "sv",  { 14,  "swedish",        } },
-    { "it",  { 15,  "italian",        } },
-    { "id",  { 16,  "indonesian",     } },
-    { "hi",  { 17,  "hindi",          } },
-    { "fi",  { 18,  "finnish",        } },
-    { "vi",  { 19,  "vietnamese",     } },
-    { "he",  { 20,  "hebrew",         } },
-    { "uk",  { 21,  "ukrainian",      } },
-    { "el",  { 22,  "greek",          } },
-    { "ms",  { 23,  "malay",          } },
-    { "cs",  { 24,  "czech",          } },
-    { "ro",  { 25,  "romanian",       } },
-    { "da",  { 26,  "danish",         } },
-    { "hu",  { 27,  "hungarian",      } },
-    { "ta",  { 28,  "tamil",          } },
-    { "no",  { 29,  "norwegian",      } },
-    { "th",  { 30,  "thai",           } },
-    { "ur",  { 31,  "urdu",           } },
-    { "hr",  { 32,  "croatian",       } },
-    { "bg",  { 33,  "bulgarian",      } },
-    { "lt",  { 34,  "lithuanian",     } },
-    { "la",  { 35,  "latin",          } },
-    { "mi",  { 36,  "maori",          } },
-    { "ml",  { 37,  "malayalam",      } },
-    { "cy",  { 38,  "welsh",          } },
-    { "sk",  { 39,  "slovak",         } },
-    { "te",  { 40,  "telugu",         } },
-    { "fa",  { 41,  "persian",        } },
-    { "lv",  { 42,  "latvian",        } },
-    { "bn",  { 43,  "bengali",        } },
-    { "sr",  { 44,  "serbian",        } },
-    { "az",  { 45,  "azerbaijani",    } },
-    { "sl",  { 46,  "slovenian",      } },
-    { "kn",  { 47,  "kannada",        } },
-    { "et",  { 48,  "estonian",       } },
-    { "mk",  { 49,  "macedonian",     } },
-    { "br",  { 50,  "breton",         } },
-    { "eu",  { 51,  "basque",         } },
-    { "is",  { 52,  "icelandic",      } },
-    { "hy",  { 53,  "armenian",       } },
-    { "ne",  { 54,  "nepali",         } },
-    { "mn",  { 55,  "mongolian",      } },
-    { "bs",  { 56,  "bosnian",        } },
-    { "kk",  { 57,  "kazakh",         } },
-    { "sq",  { 58,  "albanian",       } },
-    { "sw",  { 59,  "swahili",        } },
-    { "gl",  { 60,  "galician",       } },
-    { "mr",  { 61,  "marathi",        } },
-    { "pa",  { 62,  "punjabi",        } },
-    { "si",  { 63,  "sinhala",        } },
-    { "km",  { 64,  "khmer",          } },
-    { "sn",  { 65,  "shona",          } },
-    { "yo",  { 66,  "yoruba",         } },
-    { "so",  { 67,  "somali",         } },
-    { "af",  { 68,  "afrikaans",      } },
-    { "oc",  { 69,  "occitan",        } },
-    { "ka",  { 70,  "georgian",       } },
-    { "be",  { 71,  "belarusian",     } },
-    { "tg",  { 72,  "tajik",          } },
-    { "sd",  { 73,  "sindhi",         } },
-    { "gu",  { 74,  "gujarati",       } },
-    { "am",  { 75,  "amharic",        } },
-    { "yi",  { 76,  "yiddish",        } },
-    { "lo",  { 77,  "lao",            } },
-    { "uz",  { 78,  "uzbek",          } },
-    { "fo",  { 79,  "faroese",        } },
-    { "ht",  { 80,  "haitian creole", } },
-    { "ps",  { 81,  "pashto",         } },
-    { "tk",  { 82,  "turkmen",        } },
-    { "nn",  { 83,  "nynorsk",        } },
-    { "mt",  { 84,  "maltese",        } },
-    { "sa",  { 85,  "sanskrit",       } },
-    { "lb",  { 86,  "luxembourgish",  } },
-    { "my",  { 87,  "myanmar",        } },
-    { "bo",  { 88,  "tibetan",        } },
-    { "tl",  { 89,  "tagalog",        } },
-    { "mg",  { 90,  "malagasy",       } },
-    { "as",  { 91,  "assamese",       } },
-    { "tt",  { 92,  "tatar",          } },
-    { "haw", { 93,  "hawaiian",       } },
-    { "ln",  { 94,  "lingala",        } },
-    { "ha",  { 95,  "hausa",          } },
-    { "ba",  { 96,  "bashkir",        } },
-    { "jw",  { 97,  "javanese",       } },
-    { "su",  { 98,  "sundanese",      } },
-};
-
-static const size_t MB = 1ull*1024*1024;
-
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,     62ull*MB },
-    { MODEL_BASE,     80ull*MB },
-    { MODEL_SMALL,   120ull*MB },
-    { MODEL_MEDIUM,  158ull*MB },
-    { MODEL_LARGE,   198ull*MB },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_TINY,     18ull*MB },
-    { MODEL_BASE,     24ull*MB },
-    { MODEL_SMALL,    36ull*MB },
-    { MODEL_MEDIUM,   48ull*MB },
-    { MODEL_LARGE,    60ull*MB },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH2 = {
-    { MODEL_TINY,      4ull*MB },
-    { MODEL_BASE,      4ull*MB },
-    { MODEL_SMALL,     6ull*MB },
-    { MODEL_MEDIUM,    7ull*MB },
-    { MODEL_LARGE,     9ull*MB },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
-    { MODEL_TINY,      4ull*MB },
-    { MODEL_BASE,      4ull*MB },
-    { MODEL_SMALL,     6ull*MB },
-    { MODEL_MEDIUM,    7ull*MB },
-    { MODEL_LARGE,     9ull*MB },
-};
-
-static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
-    { GGML_TYPE_F32,
-        {
-            { MODEL_TINY,     74ull*MB },
-            { MODEL_BASE,    142ull*MB },
-            { MODEL_SMALL,   466ull*MB },
-            { MODEL_MEDIUM, 1464ull*MB },
-            { MODEL_LARGE,  2952ull*MB },
-        },
-    },
-    { GGML_TYPE_F16,
-        {
-            { MODEL_TINY,     74ull*MB },
-            { MODEL_BASE,    142ull*MB },
-            { MODEL_SMALL,   466ull*MB },
-            { MODEL_MEDIUM, 1464ull*MB },
-            { MODEL_LARGE,  2952ull*MB },
-        },
-    },
-    { GGML_TYPE_Q4_0,
-        {
-            { MODEL_TINY,     26ull*MB },
-            { MODEL_BASE,     50ull*MB },
-            { MODEL_SMALL,   154ull*MB },
-            { MODEL_MEDIUM,  470ull*MB },
-            { MODEL_LARGE,   940ull*MB },
-        },
-    },
-    { GGML_TYPE_Q4_1,
-        {
-            { MODEL_TINY,     32ull*MB },
-            { MODEL_BASE,     58ull*MB },
-            { MODEL_SMALL,   182ull*MB },
-            { MODEL_MEDIUM,  562ull*MB },
-            { MODEL_LARGE,  1124ull*MB },
-        },
-    },
-    { GGML_TYPE_Q5_0,
-        {
-            { MODEL_TINY,     30ull*MB },
-            { MODEL_BASE,     54ull*MB },
-            { MODEL_SMALL,   170ull*MB },
-            { MODEL_MEDIUM,  516ull*MB },
-            { MODEL_LARGE,  1034ull*MB },
-        },
-    },
-    { GGML_TYPE_Q5_1,
-        {
-            { MODEL_TINY,     32ull*MB },
-            { MODEL_BASE,     58ull*MB },
-            { MODEL_SMALL,   182ull*MB },
-            { MODEL_MEDIUM,  562ull*MB },
-            { MODEL_LARGE,  1124ull*MB },
-        },
-    },
-    { GGML_TYPE_Q8_0,
-        {
-            { MODEL_TINY,     45ull*MB },
-            { MODEL_BASE,     84ull*MB },
-            { MODEL_SMALL,   268ull*MB },
-            { MODEL_MEDIUM,  834ull*MB },
-            { MODEL_LARGE,  1674ull*MB },
-        },
-    },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_TINY,      3ull*MB },
-    { MODEL_BASE,      6ull*MB },
-    { MODEL_SMALL,    16ull*MB },
-    { MODEL_MEDIUM,   43ull*MB },
-    { MODEL_LARGE,    71ull*MB },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
-    { MODEL_TINY,      9ull*MB },
-    { MODEL_BASE,     18ull*MB },
-    { MODEL_SMALL,    53ull*MB },
-    { MODEL_MEDIUM,  141ull*MB },
-    { MODEL_LARGE,   235ull*MB },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,     30ull*MB },
-    { MODEL_BASE,     38ull*MB },
-    { MODEL_SMALL,    56ull*MB },
-    { MODEL_MEDIUM,   74ull*MB },
-    { MODEL_LARGE,    94ull*MB },
-};
-
-static const std::map<e_model, size_t> MEM_REQ_DECODE = {
-    { MODEL_TINY,      3ull*MB },
-    { MODEL_BASE,      5ull*MB },
-    { MODEL_SMALL,    10ull*MB },
-    { MODEL_MEDIUM,   18ull*MB },
-    { MODEL_LARGE,    27ull*MB },
-};
-
-struct whisper_mel {
-    int n_len;
-    int n_len_org;
-    int n_mel;
-
-    std::vector<float> data;
-};
-
-struct whisper_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-struct whisper_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    int n_vocab = 51864;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-
-    // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
-    id token_eot        = 50256;
-    id token_sot        = 50257;
-    // task tokens (used only for multilingual models)
-    id token_translate  = 50357;
-    id token_transcribe = 50358;
-    // other special tokens
-    id token_solm       = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
-    id token_prev       = 50360;
-    id token_nosp       = 50361;
-    id token_not        = 50362; // no timestamps
-    id token_beg        = 50363; // begin timestamps
-
-    bool is_multilingual() const {
-        return n_vocab == 51865;
-    }
-};
-
-struct whisper_segment {
-    int64_t t0;
-    int64_t t1;
-
-    std::string text;
-
-    std::vector<whisper_token_data> tokens;
-
-    bool speaker_turn_next;
-};
-
-// medium
-// hparams: {
-// 'n_mels': 80,
-// 'n_vocab': 51864,
-// 'n_audio_ctx': 1500,
-// 'n_audio_state': 1024,
-// 'n_audio_head': 16,
-// 'n_audio_layer': 24,
-// 'n_text_ctx': 448,
-// 'n_text_state': 1024,
-// 'n_text_head': 16,
-// 'n_text_layer': 24
-// }
-//
-// default hparams (Whisper tiny)
-struct whisper_hparams {
-    int32_t n_vocab       = 51864;
-    int32_t n_audio_ctx   = 1500;
-    int32_t n_audio_state = 384;
-    int32_t n_audio_head  = 6;
-    int32_t n_audio_layer = 4;
-    int32_t n_text_ctx    = 448;
-    int32_t n_text_state  = 384;
-    int32_t n_text_head   = 6;
-    int32_t n_text_layer  = 4;
-    int32_t n_mels        = 80;
-    int32_t ftype         = 1;
-    float   eps           = 1e-5f;
-};
-
-// audio encoding layer
-struct whisper_layer_encoder {
-    // encoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
-
-    // encoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
-
-    // encoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
-
-    // encoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
-
-    // encoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
-
-    // encoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
-
-    // encoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
-
-    // encoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
-};
-
-// token decoding layer
-struct whisper_layer_decoder {
-    // decoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
-
-    // decoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
-
-    // decoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
-
-    // decoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
-
-    // decoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
-
-    // decoder.blocks.*.cross_attn_ln
-    struct ggml_tensor * cross_attn_ln_0_w;
-    struct ggml_tensor * cross_attn_ln_0_b;
-
-    // decoder.blocks.*.cross_attn.out
-    struct ggml_tensor * cross_attn_ln_1_w;
-    struct ggml_tensor * cross_attn_ln_1_b;
-
-    // decoder.blocks.*.cross_attn.query
-    struct ggml_tensor * cross_attn_q_w;
-    struct ggml_tensor * cross_attn_q_b;
-
-    // decoder.blocks.*.cross_attn.key
-    struct ggml_tensor * cross_attn_k_w;
-
-    // decoder.blocks.*.cross_attn.value
-    struct ggml_tensor * cross_attn_v_w;
-    struct ggml_tensor * cross_attn_v_b;
-
-    // decoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
-
-    // decoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
-
-    // decoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
-};
-
-struct whisper_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx;
-
-    std::vector<uint8_t> buf;
-
-    int n; // number of tokens currently in the cache
-};
-
-struct whisper_model {
-    e_model type = MODEL_UNKNOWN;
-
-    whisper_hparams hparams;
-    whisper_filters filters;
-
-    // encoder.positional_embedding
-    struct ggml_tensor * e_pe;
-
-    // encoder.conv1
-    struct ggml_tensor * e_conv_1_w;
-    struct ggml_tensor * e_conv_1_b;
-
-    // encoder.conv2
-    struct ggml_tensor * e_conv_2_w;
-    struct ggml_tensor * e_conv_2_b;
-
-    // encoder.ln_post
-    struct ggml_tensor * e_ln_w;
-    struct ggml_tensor * e_ln_b;
-
-    // decoder.positional_embedding
-    struct ggml_tensor * d_pe;
-
-    // decoder.token_embedding
-    struct ggml_tensor * d_te;
-
-    // decoder.ln
-    struct ggml_tensor * d_ln_w;
-    struct ggml_tensor * d_ln_b;
-
-    std::vector<whisper_layer_encoder> layers_encoder;
-    std::vector<whisper_layer_decoder> layers_decoder;
-
-    // context
-    struct ggml_context * ctx;
-
-    // the model memory buffer is read-only and can be shared between processors
-    std::vector<uint8_t> * buf;
-
-    // tensors
-    int n_loaded;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-struct whisper_sequence {
-    std::vector<whisper_token_data> tokens;
-
-    // the accumulated transcription in the current iteration (used to truncate the tokens array)
-    int result_len;
-
-    double sum_logprobs_all; // the sum of the log probabilities of the tokens
-    double sum_logprobs;     // the sum of the log probabilities of the tokens (first result_len tokens)
-    double avg_logprobs;     // the average log probability of the tokens
-    double entropy;          // the entropy of the tokens
-    double score;            // likelihood rank score
-};
-
-// TAGS: WHISPER_DECODER_INIT
-struct whisper_decoder {
-    // each decoders keeps its own KV-cache
-    whisper_kv_cache kv_self;
-
-    // the currently generated sequence of tokens
-    whisper_sequence sequence;
-
-    int seek_delta; // the window shift found so far based on the decoded timestamp tokens
-
-    bool failed;    // has the current segment failed to decode?
-    bool completed; // has the decoder completed the current segment?
-    bool has_ts;    // have we already sampled a non-beg timestamp token for the current segment?
-
-    // new token probs, logits and logprobs after the last whisper_decode (1-dimensional array: [n_vocab])
-    std::vector<float> probs;
-    std::vector<float> logits;
-    std::vector<float> logprobs;
-
-    std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
-};
-
-struct whisper_state {
-    int64_t t_sample_us = 0;
-    int64_t t_encode_us = 0;
-    int64_t t_decode_us = 0;
-    int64_t t_mel_us = 0;
-
-    int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_encode = 0; // number of encoder calls
-    int32_t n_decode = 0; // number of decoder calls
-    int32_t n_fail_p = 0; // number of logprob threshold failures
-    int32_t n_fail_h = 0; // number of entropy threshold failures
-
-    // cross-attention KV cache for the decoders
-    // shared between all decoders
-    whisper_kv_cache kv_cross;
-    whisper_mel mel;
-
-    whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
-
-    // memory buffers used by encode / decode contexts
-    std::vector<uint8_t> buf_compute;
-    std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
-
-    int    buf_last = 0;
-    size_t buf_max_size[WHISPER_MAX_SCRATCH_BUFFERS] = { 0 };
-
-    // decode output (2-dimensional array: [n_tokens][n_vocab])
-    std::vector<float> logits;
-
-    std::vector<whisper_segment> result_all;
-    std::vector<whisper_token>   prompt_past;
-
-    // work container used to avoid memory allocations
-    std::vector<std::pair<double, whisper_vocab::id>> logits_id;
-
-    mutable std::mt19937 rng; // used for sampling at t > 0.0
-
-    int lang_id = 0; // english by default
-
-    std::string path_model; // populated by whisper_init_from_file()
-#ifdef WHISPER_USE_COREML
-    whisper_coreml_context * ctx_coreml = nullptr;
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-    whisper_openvino_context * ctx_openvino = nullptr;
-#endif
-
-    // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg = 0;
-    int64_t t_last = 0;
-    whisper_token tid_last;
-    std::vector<float> energy; // PCM signal energy
-
-    // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx = 0; // 0 - use default
-
-    void use_buf(struct ggml_context * ctx, int i) {
-#if defined(WHISPER_USE_SCRATCH)
-        size_t last_size = 0;
-
-        if (i == -1) {
-            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
-        } else {
-            auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
-        }
-
-        if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
-        }
-
-        buf_last = i;
-#else
-        (void) i;
-        (void) ctx;
-#endif
-    }
-
-    size_t get_buf_max_mem(int i) const {
-#if defined(WHISPER_USE_SCRATCH)
-        return buf_max_size[i];
-#else
-        (void) i;
-        return 0;
-#endif
-    }
-};
-
-struct whisper_context {
-    int64_t t_load_us  = 0;
-    int64_t t_start_us = 0;
-
-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
-    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
-
-    whisper_model model;
-    whisper_vocab vocab;
-    whisper_state * state = nullptr;
-
-    std::string path_model; // populated by whisper_init_from_file()
-};
-
-static void whisper_default_log(const char * text) {
-    fprintf(stderr, "%s", text);
-}
-
-static whisper_log_callback whisper_log = whisper_default_log;
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((gnu_format(printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static void log(const char * fmt, ...) {
-    if (!whisper_log) return;
-    char buf[1024];
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(buf, sizeof(buf), fmt, args);
-    whisper_log(buf);
-}
-
-template<typename T>
-static void read_safe(whisper_model_loader * loader, T & dest) {
-    loader->read(loader->context, &dest, sizeof(T));
-    BYTESWAP_VALUE(dest);
-}
-
-static bool kv_cache_init(
-        const struct whisper_hparams & hparams,
-                        const size_t   mem_bytes,
-             struct whisper_kv_cache & cache,
-                           ggml_type   wtype,
-                                 int   n_ctx) {
-    cache.buf.resize(mem_bytes);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ cache.buf.size(),
-        /*.mem_buffer =*/ cache.buf.data(),
-        /*.no_alloc   =*/ false,
-    };
-
-    cache.ctx = ggml_init(params);
-
-    if (!cache.ctx) {
-        log("%s: failed to allocate memory for kv cache\n", __func__);
-        return false;
-    }
-
-    const int n_text_state = hparams.n_text_state;
-    const int n_text_layer = hparams.n_text_layer;
-
-    const int n_mem      = n_text_layer*n_ctx;
-    const int n_elements = n_text_state*n_mem;
-
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-
-    return true;
-}
-
-static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
-    WHISPER_ASSERT(cache.ctx);
-
-    const int n_elements = ggml_nelements(cache.k);
-    WHISPER_ASSERT(n_elements == ggml_nelements(cache.v));
-
-    const ggml_type wtype = cache.k->type;
-    WHISPER_ASSERT(wtype == cache.v->type);
-
-    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ cache.buf.size(),
-        /*.mem_buffer =*/ cache.buf.data(),
-        /*.no_alloc   =*/ false,
-    };
-
-    cache.ctx = ggml_init(params);
-
-    if (!cache.ctx) {
-        log("%s: failed to allocate memory for kv cache\n", __func__);
-        return false;
-    }
-
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-
-    return true;
-}
-
-static void kv_cache_free(struct whisper_kv_cache & cache) {
-    if (cache.ctx) {
-        ggml_free(cache.ctx);
-        cache.ctx = nullptr;
-    }
-}
-
-// load the model from a ggml file
-//
-// file format:
-//
-//   - hparams
-//   - pre-computed mel filters
-//   - vocab
-//   - weights
-//
-// see the convert-pt-to-ggml.py script for details
-//
-static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
-    log("%s: loading model\n", __func__);
-
-    const int64_t t_start_us = ggml_time_us();
-
-    wctx.t_start_us = t_start_us;
-
-    auto & model = wctx.model;
-    auto & vocab = wctx.vocab;
-
-    // verify magic
-    {
-        uint32_t magic;
-        read_safe(loader, magic);
-        if (magic != GGML_FILE_MAGIC) {
-            log("%s: invalid model data (bad magic)\n", __func__);
-            return false;
-        }
-    }
-
-    //load hparams
-    {
-        auto & hparams = model.hparams;
-
-        read_safe(loader, hparams.n_vocab);
-        read_safe(loader, hparams.n_audio_ctx);
-        read_safe(loader, hparams.n_audio_state);
-        read_safe(loader, hparams.n_audio_head);
-        read_safe(loader, hparams.n_audio_layer);
-        read_safe(loader, hparams.n_text_ctx);
-        read_safe(loader, hparams.n_text_state);
-        read_safe(loader, hparams.n_text_head);
-        read_safe(loader, hparams.n_text_layer);
-        read_safe(loader, hparams.n_mels);
-        read_safe(loader, hparams.ftype);
-
-        assert(hparams.n_text_state == hparams.n_audio_state);
-
-        if (hparams.n_audio_layer == 4) {
-            model.type = e_model::MODEL_TINY;
-        }
-
-        if (hparams.n_audio_layer == 6) {
-            model.type = e_model::MODEL_BASE;
-        }
-
-        if (hparams.n_audio_layer == 12) {
-            model.type = e_model::MODEL_SMALL;
-        }
-
-        if (hparams.n_audio_layer == 24) {
-            model.type = e_model::MODEL_MEDIUM;
-        }
-
-        if (hparams.n_audio_layer == 32) {
-            model.type = e_model::MODEL_LARGE;
-        }
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-
-        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-        // in order to save memory and also to speed up the computation
-        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-        if (wctx.wtype == GGML_TYPE_COUNT) {
-            log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
-            return false;
-        }
-
-        const size_t scale = model.hparams.ftype ? 1 : 2;
-
-        log("%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
-        log("%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
-        log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        log("%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
-        log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        log("%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
-        log("%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
-        log("%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
-        log("%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
-        log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
-        log("%s: qntvr         = %d\n", __func__, qntvr);
-        log("%s: type          = %d\n", __func__, model.type);
-
-        // print memory requirements
-        {
-            // this is the total memory required to run the inference
-            const size_t mem_required =
-                     MEM_REQ_SCRATCH0.at(model.type) +
-                     MEM_REQ_SCRATCH1.at(model.type) +
-                     MEM_REQ_SCRATCH2.at(model.type) +
-                     MEM_REQ_SCRATCH3.at(model.type) +
-                scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
-                scale*MEM_REQ_KV_CROSS.at(model.type) +
-                scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
-
-            // this is the memory required by one decoder
-            const size_t mem_required_decoder =
-                scale*MEM_REQ_KV_SELF.at(model.type);
-
-            log("%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
-                    mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
-        }
-
-        // initialize all memory buffers
-        // always have at least one decoder
-
-        wctx.model.buf = new std::vector<uint8_t>();
-        wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));
-
-        // we skip initialization of the state until it is needed
-        // because it might be that state will always be provided externally.
-    }
-
-    // load mel filters
-    {
-        auto & filters = wctx.model.filters;
-
-        read_safe(loader, filters.n_mel);
-        read_safe(loader, filters.n_fft);
-
-        filters.data.resize(filters.n_mel * filters.n_fft);
-        loader->read(loader->context, filters.data.data(), filters.data.size() * sizeof(float));
-        BYTESWAP_FILTERS(filters);
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        read_safe(loader, n_vocab);
-
-        //if (n_vocab != model.hparams.n_vocab) {
-        //    log("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-        //    return false;
-        //}
-
-        std::string word;
-        std::vector<char> tmp;
-
-        tmp.reserve(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            read_safe(loader, len);
-
-            if (len > 0) {
-                tmp.resize(len);
-                loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
-                word.assign(&tmp[0], tmp.size());
-            } else {
-                // seems like we have an empty-string token in multi-language models (i = 50256)
-                //log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
-                word = "";
-            }
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-
-            //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-        }
-
-        vocab.n_vocab = model.hparams.n_vocab;
-        if (vocab.is_multilingual()) {
-            vocab.token_eot++;
-            vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
-        }
-
-        if (n_vocab < model.hparams.n_vocab) {
-            log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
-            for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
-                if (i > vocab.token_beg) {
-                    word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
-                } else if (i == vocab.token_eot) {
-                    word = "[_EOT_]";
-                } else if (i == vocab.token_sot) {
-                    word = "[_SOT_]";
-                } else if (i == vocab.token_solm) {
-                    word = "[_SOLM_]";
-                } else if (i == vocab.token_prev) {
-                    word = "[_PREV_]";
-                } else if (i == vocab.token_nosp) {
-                    word = "[_NOSP_]";
-                } else if (i == vocab.token_not) {
-                    word = "[_NOT_]";
-                } else if (i == vocab.token_beg) {
-                    word = "[_BEG_]";
-                } else {
-                    word = "[_extra_token_" + std::to_string(i) + "]";
-                }
-                vocab.token_to_id[word] = i;
-                vocab.id_to_token[i] = word;
-            }
-        }
-    }
-
-    size_t ctx_size = 0;
-
-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_vocab = hparams.n_vocab;
-
-        const int n_audio_ctx   = hparams.n_audio_ctx;
-        const int n_audio_state = hparams.n_audio_state;
-        const int n_audio_layer = hparams.n_audio_layer;
-
-        const int n_text_ctx   = hparams.n_text_ctx;
-        const int n_text_state = hparams.n_text_state;
-        const int n_text_layer = hparams.n_text_layer;
-
-        const int n_mels = hparams.n_mels;
-
-        // encoder
-        {
-            ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
-
-            ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype);         // e_conv_1_w
-            ctx_size +=          n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
-
-            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype);         // e_conv_2_w
-            ctx_size +=                 n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
-
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
-        }
-
-        // decoder
-        {
-            ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
-
-            ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
-
-            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
-            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
-        }
-
-        // encoder layers
-        {
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
-
-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // mlp_0_w
-            ctx_size += n_audio_layer*(              4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
-
-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // mlp_1_w
-            ctx_size += n_audio_layer*(                n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
-
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
-
-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // attn_q_w
-            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
-
-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
-
-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // attn_v_w
-            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
-
-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // attn_ln_1_w
-            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
-        }
-
-        // decoder layers
-        {
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
-
-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype));         // mlp_0_w
-            ctx_size += n_text_layer*(             4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
-
-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype));         // mlp_1_w
-            ctx_size += n_text_layer*(               n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
-
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // attn_q_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // attn_v_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // attn_ln_1_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
-                                                                                                //
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // cross_attn_q_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // cross_attn_v_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
-
-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // cross_attn_ln_1_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
-        }
-
-        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
-
-        log("%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ wctx.model.buf->size(),
-            /*.mem_buffer =*/ wctx.model.buf->data(),
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            log("%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        auto & ctx = model.ctx;
-
-        const auto & hparams = model.hparams;
-
-        const int n_vocab = hparams.n_vocab;
-
-        const int n_audio_ctx   = hparams.n_audio_ctx;
-        const int n_audio_state = hparams.n_audio_state;
-        const int n_audio_layer = hparams.n_audio_layer;
-
-        const int n_text_ctx   = hparams.n_text_ctx;
-        const int n_text_state = hparams.n_text_state;
-        const int n_text_layer = hparams.n_text_layer;
-
-        const int n_mels = hparams.n_mels;
-
-        model.layers_encoder.resize(n_audio_layer);
-        model.layers_decoder.resize(n_text_layer);
-
-        // encoder
-        {
-            model.e_pe       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
-
-            model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype,         3, n_mels, n_audio_state);
-            model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
-
-            model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype,         3, n_audio_state, n_audio_state);
-            model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
-
-            model.e_ln_w     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-            model.e_ln_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-
-            // map by name
-            model.tensors["encoder.positional_embedding"] = model.e_pe;
-
-            model.tensors["encoder.conv1.weight"]         = model.e_conv_1_w;
-            model.tensors["encoder.conv1.bias"]           = model.e_conv_1_b;
-
-            model.tensors["encoder.conv2.weight"]         = model.e_conv_2_w;
-            model.tensors["encoder.conv2.bias"]           = model.e_conv_2_b;
-
-            model.tensors["encoder.ln_post.weight"]       = model.e_ln_w;
-            model.tensors["encoder.ln_post.bias"]         = model.e_ln_b;
-
-            for (int i = 0; i < n_audio_layer; ++i) {
-                auto & layer = model.layers_encoder[i];
-
-                layer.mlp_ln_w    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-                layer.mlp_ln_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.mlp_0_w     = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, 4*n_audio_state);
-                layer.mlp_0_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
-
-                layer.mlp_1_w     = ggml_new_tensor_2d(ctx, wtype,         4*n_audio_state, n_audio_state);
-                layer.mlp_1_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_q_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-                layer.attn_q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_k_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-
-                layer.attn_v_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-                layer.attn_v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                // map by name
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"]     = layer.mlp_ln_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"]       = layer.mlp_ln_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"]      = layer.mlp_0_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"]        = layer.mlp_0_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"]      = layer.mlp_1_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"]        = layer.mlp_1_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"]    = layer.attn_ln_0_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"]      = layer.attn_ln_0_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"]   = layer.attn_q_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"]   = layer.attn_k_w;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"]   = layer.attn_v_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"]   = layer.attn_ln_1_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"]     = layer.attn_ln_1_b;
-            }
-        }
-
-        // decoder
-        {
-            model.d_pe   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
-
-            model.d_te   = ggml_new_tensor_2d(ctx, wtype,         n_text_state, n_vocab);
-
-            model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-            model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-
-            // map by name
-            model.tensors["decoder.positional_embedding"]   = model.d_pe;
-
-            model.tensors["decoder.token_embedding.weight"] = model.d_te;
-
-            model.tensors["decoder.ln.weight"]              = model.d_ln_w;
-            model.tensors["decoder.ln.bias"]                = model.d_ln_b;
-
-            for (int i = 0; i < n_text_layer; ++i) {
-                auto & layer = model.layers_decoder[i];
-
-                layer.mlp_ln_w          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-                layer.mlp_ln_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.mlp_0_w           = ggml_new_tensor_2d(ctx, wtype,           n_text_state, 4*n_text_state);
-                layer.mlp_0_b           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
-
-                layer.mlp_1_w           = ggml_new_tensor_2d(ctx, wtype,         4*n_text_state, n_text_state);
-                layer.mlp_1_b           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_ln_0_w       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-                layer.attn_ln_0_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_q_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.attn_q_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_k_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-
-                layer.attn_v_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.attn_v_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_ln_1_w       = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.attn_ln_1_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-                layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_q_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.cross_attn_q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_k_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-
-                layer.cross_attn_v_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.cross_attn_v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                // map by name
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"]           = layer.mlp_ln_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"]             = layer.mlp_ln_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"]            = layer.mlp_0_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"]              = layer.mlp_0_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"]            = layer.mlp_1_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"]              = layer.mlp_1_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"]          = layer.attn_ln_0_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"]            = layer.attn_ln_0_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"]       = layer.attn_q_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"]         = layer.attn_q_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"]         = layer.attn_k_w;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"]       = layer.attn_v_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"]         = layer.attn_v_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"]         = layer.attn_ln_1_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"]           = layer.attn_ln_1_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"]    = layer.cross_attn_ln_0_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"]      = layer.cross_attn_ln_0_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"]   = layer.cross_attn_q_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"]   = layer.cross_attn_k_w;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"]   = layer.cross_attn_v_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"]   = layer.cross_attn_ln_1_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"]     = layer.cross_attn_ln_1_b;
-            }
-        }
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        model.n_loaded = 0;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            read_safe(loader, n_dims);
-            read_safe(loader, length);
-            read_safe(loader, ttype);
-
-            if (loader->eof(loader->context)) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[4] = { 1, 1, 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
-                nelements *= ne[i];
-            }
-
-            std::string name;
-            std::vector<char> tmp(length); // create a buffer
-            loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
-            name.assign(&tmp[0], tmp.size());
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                log("%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
-                log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
-                        __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
-                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
-                return false;
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
-            BYTESWAP_TENSOR(tensor);
-
-            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
-            model.n_loaded++;
-        }
-
-        log("%s: model size    = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
-
-        if (model.n_loaded == 0) {
-            log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-        } else if (model.n_loaded != (int) model.tensors.size()) {
-            log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
-            return false;
-        }
-    }
-
-    wctx.t_load_us = ggml_time_us() - t_start_us;
-
-    return true;
-}
-
-// evaluate the encoder with the given state
-//
-// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder
-// part of the transformer model and returns the encoded features
-//
-//   - wctx:      the model
-//   - wstate:     the state of the encoder
-//   - n_threads:  number of threads to use
-//   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
-//
-static bool whisper_encode_internal(
-        whisper_context & wctx,
-          whisper_state & wstate,
-              const int   mel_offset,
-              const int   n_threads){
-
-    const int64_t t_start_us = ggml_time_us();
-
-    const auto & model   = wctx.model;
-    const auto & mel_inp = wstate.mel;
-    const auto & hparams = model.hparams;
-
-    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-    const int n_state = hparams.n_audio_state;
-    const int n_head  = hparams.n_audio_head;
-    const int n_layer = hparams.n_audio_layer;
-
-    const int n_mels = hparams.n_mels;
-    assert(mel_inp.n_mel == n_mels);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ wstate.buf_compute.size(),
-        /*.mem_buffer =*/ wstate.buf_compute.data(),
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    wstate.use_buf(ctx0, 0);
-
-    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
-    assert(mel->type == GGML_TYPE_F32);
-    {
-        float * dst = (float *) mel->data;
-        memset(dst, 0, ggml_nbytes(mel));
-
-        const int i0 = std::min(mel_offset, mel_inp.n_len);
-        const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
-
-        for (int j = 0; j < mel_inp.n_mel; ++j) {
-            for (int i = i0; i < i1; ++i) {
-                dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
-            }
-        }
-    }
-
-    struct ggml_tensor * cur;
-
-#ifndef WHISPER_USE_COREML
-    const bool use_coreml = false;
-#else
-    const bool use_coreml = wstate.ctx_coreml != nullptr;
-#endif
-
-#ifndef WHISPER_USE_OPENVINO
-    const bool use_openvino = false;
-#else
-    const bool use_openvino = wstate.ctx_openvino != nullptr;
-#endif
-
-    if (!use_coreml && !use_openvino) {
-        // convolution + gelu
-        {
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        model.e_conv_1_b,
-                        cur),
-                    cur);
-
-            cur = ggml_gelu(ctx0, cur);
-
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        model.e_conv_2_b,
-                        cur),
-                    cur);
-
-            cur = ggml_gelu(ctx0, cur);
-        }
-
-        wstate.use_buf(ctx0, 3);
-
-        // ===================================================================
-        // NOTE: experimenting with partial evaluation of the encoder (ignore)
-        //static int iter = -1;
-        //const int n_iter = 1500/n_ctx;
-
-        //iter = (iter + 1) % n_iter;
-
-        //if (iter == 0) {
-        //    memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-        //    memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
-        //}
-
-        static int iter = 0;
-
-        const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
-        const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
-
-        struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
-
-        cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
-
-        // ===================================================================
-
-        // original:
-        //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
-
-        struct ggml_tensor * inpL = cur;
-
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers_encoder[il];
-
-            // norm
-            {
-                wstate.use_buf(ctx0, 0);
-
-                cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-                // cur = ln_0_w*cur + ln_0_b
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
-                            cur),
-                        ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
-            }
-
-            // self-attention
-            {
-                wstate.use_buf(ctx0, 1);
-
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                        layer.attn_q_w,
-                        cur);
-
-                Qcur = ggml_add(ctx0,
-                        ggml_repeat(ctx0,
-                            layer.attn_q_b,
-                            Qcur),
-                        Qcur);
-
-                //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-                // note: no bias for Key
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
-                        layer.attn_k_w,
-                        cur);
-
-                //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                        layer.attn_v_w,
-                        cur);
-
-                Vcur = ggml_add(ctx0,
-                        ggml_repeat(ctx0,
-                            layer.attn_v_b,
-                            Vcur),
-                        Vcur);
-
-                // ------
-
-                wstate.use_buf(ctx0, 0);
-
-#ifdef WHISPER_USE_FLASH_ATTN
-                struct ggml_tensor * Q =
-                    ggml_permute(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-
-                struct ggml_tensor * K =
-                    ggml_permute(ctx0,
-                            ggml_cpy(ctx0,
-                                Kcur,
-                                ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-
-                struct ggml_tensor * V =
-                    ggml_cpy(ctx0,
-                            ggml_permute(ctx0,
-                                ggml_reshape_3d(ctx0,
-                                    Vcur,
-                                    n_state/n_head, n_head, n_ctx),
-                                1, 2, 0, 3),
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
-
-                struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
-#else
-                struct ggml_tensor * Q =
-                    ggml_permute(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-
-                struct ggml_tensor * K =
-                    ggml_permute(ctx0,
-                            ggml_cpy(ctx0,
-                                Kcur,
-                                ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-
-                // K * Q
-                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-                struct ggml_tensor * KQ_scaled =
-                    ggml_scale_inplace(ctx0,
-                            KQ,
-                            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-                            );
-
-                struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
-
-                struct ggml_tensor * V =
-                    ggml_cpy(ctx0,
-                            ggml_permute(ctx0,
-                                ggml_reshape_3d(ctx0,
-                                    Vcur,
-                                    n_state/n_head, n_head, n_ctx),
-                                1, 2, 0, 3),
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
-                            );
-
-                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-#endif
-                struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-                wstate.use_buf(ctx0, 1);
-
-                cur = ggml_cpy(ctx0,
-                        KQV_merged,
-                        ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
-            }
-
-            // projection
-            {
-                wstate.use_buf(ctx0, 0);
-
-                cur = ggml_mul_mat(ctx0,
-                        layer.attn_ln_1_w,
-                        cur);
-
-                wstate.use_buf(ctx0, 1);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
-                        cur);
-            }
-
-            wstate.use_buf(ctx0, 2);
-
-            // add the input
-            cur = ggml_add(ctx0, cur, inpL);
-
-            struct ggml_tensor * inpFF = cur;
-
-            // feed-forward network
-            {
-                // norm
-                {
-                    wstate.use_buf(ctx0, 0);
-
-                    cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                    wstate.use_buf(ctx0, 1);
-
-                    // cur = mlp_ln_w*cur + mlp_ln_b
-                    cur = ggml_add(ctx0,
-                            ggml_mul(ctx0,
-                                ggml_repeat(ctx0, layer.mlp_ln_w, cur),
-                                cur),
-                            ggml_repeat(ctx0, layer.mlp_ln_b, cur));
-                }
-
-#ifdef WHISPER_USE_FLASH_FF
-                wstate.use_buf(ctx0, 0);
-
-                cur = ggml_flash_ff(ctx0,
-                        ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
-                        layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
-#else
-                wstate.use_buf(ctx0, 0);
-
-                // fully connected
-                cur = ggml_mul_mat(ctx0,
-                        layer.mlp_0_w,
-                        cur);
-
-                wstate.use_buf(ctx0, 1);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, layer.mlp_0_b, cur),
-                        cur);
-
-                wstate.use_buf(ctx0, 0);
-
-                // GELU activation
-                cur = ggml_gelu(ctx0, cur);
-
-                wstate.use_buf(ctx0, 1);
-
-                // projection
-                cur = ggml_mul_mat(ctx0,
-                        layer.mlp_1_w,
-                        cur);
-
-                wstate.use_buf(ctx0, 0);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, layer.mlp_1_b, cur),
-                        cur);
-#endif
-            }
-
-            wstate.use_buf(ctx0, 3);
-
-            inpL = ggml_add(ctx0, cur, inpFF);
-        }
-
-        cur = inpL;
-
-        // norm
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_norm(ctx0, cur, hparams.eps);
-
-            wstate.use_buf(ctx0, 1);
-
-            // cur = ln_f_g*cur + ln_f_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.e_ln_w, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.e_ln_b, cur));
-        }
-
-        wstate.use_buf(ctx0, -1);
-
-        // run the computation
-        {
-            struct ggml_cgraph gf = {};
-
-            ggml_build_forward_expand  (&gf, cur);
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-            //ggml_graph_print(&gf);
-        }
-    }
-#ifdef WHISPER_USE_COREML
-    else if (use_coreml) {
-        wstate.use_buf(ctx0, -1);
-
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-
-        whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
-    }
-#endif
-#ifdef WHISPER_USE_OPENVINO
-    else if (use_openvino) {
-        wstate.use_buf(ctx0, -1);
-
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-
-        if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
-            return false;
-        }
-    }
-#endif
-
-    // cur
-    //{
-    //    printf("ne0 = %d\n", cur->ne[0]);
-    //    printf("ne1 = %d\n", cur->ne[1]);
-    //    for (int i = 0; i < 10; ++i) {
-    //        printf("%8.4f ", ((float *)(cur->data))[i]);
-    //    }
-    //    printf("... ");
-    //    for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) {
-    //        printf("%8.4f ", ((float *)(cur->data))[i]);
-    //    }
-    //    printf("\n");
-    //}
-
-    // pre-compute cross-attention memory
-    {
-        struct ggml_cgraph gf = {};
-
-        // TODO: hack to disconnect the encoded features from the previous graph
-        cur->op = GGML_OP_NONE;
-        cur->src[0] = nullptr;
-        cur->src[1] = nullptr;
-
-        for (int il = 0; il < model.hparams.n_text_layer; ++il) {
-            auto& layer = model.layers_decoder[il];
-
-            wstate.use_buf(ctx0, 0);
-
-            struct ggml_tensor* Kcross = ggml_mul_mat(ctx0,
-                layer.cross_attn_k_w,
-                cur);
-
-            Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
-
-            wstate.use_buf(ctx0, 1);
-
-            struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
-                layer.cross_attn_v_w,
-                cur);
-
-            Vcross = ggml_add(ctx0,
-                ggml_repeat(ctx0,
-                    layer.cross_attn_v_b,
-                    Vcross),
-                Vcross);
-
-            wstate.use_buf(ctx0, -1);
-
-            Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
-
-            struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-            struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
-                    (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
-                    (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
-
-            ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
-            ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
-        }
-
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-        //ggml_graph_print(&gf);
-    }
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-    //        ggml_used_mem(ctx0)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
-
-    ggml_free(ctx0);
-
-    wstate.t_encode_us += ggml_time_us() - t_start_us;
-    wstate.n_encode++;
-
-    return true;
-}
-
-// evaluate the decoder
-//
-// given text prompt + audio features -> computes the logits for the next token
-//
-//   - model:      the model
-//   - n_threads:  number of threads to use
-//   - tokens:     text prompt
-//   - n_tokens:   number of tokens in the prompt
-//   - n_past:     number of past tokens to prefix the prompt with
-//
-static bool whisper_decode_internal(
-        whisper_context & wctx,
-          whisper_state & wstate,
-        whisper_decoder & decoder,
-    const whisper_token * tokens,
-              const int   n_tokens,
-              const int   n_past,
-              const int   n_threads) {
-    const int64_t t_start_us = ggml_time_us();
-
-    const auto & model   = wctx.model;
-    const auto & hparams = model.hparams;
-
-    auto & kv_self = decoder.kv_self;
-
-    WHISPER_ASSERT(!!kv_self.ctx);
-
-    auto & logits_out = wstate.logits;
-
-    const int n_vocab = hparams.n_vocab;
-
-    const int n_ctx   = hparams.n_text_ctx;
-    const int n_state = hparams.n_text_state;
-    const int n_head  = hparams.n_text_head;
-    const int n_layer = hparams.n_text_layer;
-
-    const int N = n_tokens;
-    const int M = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-
-    //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ wstate.buf_compute.size(),
-        /*.mem_buffer =*/ wstate.buf_compute.data(),
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    wstate.use_buf(ctx0, 3);
-
-    // token encoding + position encoding
-    struct ggml_tensor * cur =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.d_te, embd),
-                ggml_get_rows(ctx0, model.d_pe, position));
-
-    struct ggml_tensor * inpL = cur;
-
-    for (int il = 0; il < n_layer; ++il) {
-        const auto & layer = model.layers_decoder[il];
-
-        // norm
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
-                        cur),
-                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                    layer.attn_q_w,
-                    cur);
-
-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_q_b,
-                        Qcur),
-                    Qcur);
-
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-            // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
-                    layer.attn_k_w,
-                    cur);
-
-            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                        layer.attn_v_w,
-                        cur);
-
-                Vcur = ggml_add(ctx0,
-                        ggml_repeat(ctx0,
-                            layer.attn_v_b,
-                            Vcur),
-                        Vcur);
-
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
-                        (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // ------
-
-            wstate.use_buf(ctx0, 0);
-
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.k)*n_state),
-                            n_state/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            wstate.use_buf(ctx0, 1);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
-            //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-            //            );
-
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, kv_self.v,
-                        n_past + N, n_state/n_head, n_head,
-                        n_ctx*ggml_element_size(kv_self.v),
-                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
-                        il*n_ctx*ggml_element_size(kv_self.v)*n_state);
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
-        }
-
-        // projection
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_mul_mat(ctx0,
-                    layer.attn_ln_1_w,
-                    cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
-                    cur);
-        }
-
-        wstate.use_buf(ctx0, 2);
-
-        // add the input
-        struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);
-
-        // norm
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_norm(ctx0, inpCA, hparams.eps); // note: we use inpCA here
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
-                        cur),
-                    ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
-        }
-
-        // cross-attention
-        {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                    layer.cross_attn_q_w,
-                    cur);
-
-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.cross_attn_q_b,
-                        Qcur),
-                    Qcur);
-
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-            // Kcross is already scaled
-            struct ggml_tensor * Kcross =
-                ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
-                        n_state/n_head, n_head, M);
-
-            //struct ggml_tensor * Vcross =
-            //    ggml_reshape_3d(ctx0,
-            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
-            //            n_state/n_head, n_head, M);
-
-            //struct ggml_tensor * V_trans =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
-
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, wstate.kv_cross.v,
-                        M, n_state/n_head, n_head,
-                        M*ggml_element_size(wstate.kv_cross.v),
-                        M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
-                        il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
-
-            // ------
-
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
-            //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-            //            );
-
-            // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_state, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
-        }
-
-        // projection
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_mul_mat(ctx0,
-                    layer.cross_attn_ln_1_w,
-                    cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
-                    cur);
-        }
-
-        wstate.use_buf(ctx0, 2);
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpCA);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                wstate.use_buf(ctx0, 0);
-
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                wstate.use_buf(ctx0, 1);
-
-                // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, layer.mlp_ln_w, cur),
-                            cur),
-                        ggml_repeat(ctx0, layer.mlp_ln_b, cur));
-            }
-
-            wstate.use_buf(ctx0, 0);
-
-            // fully connected
-            cur = ggml_mul_mat(ctx0,
-                    layer.mlp_0_w,
-                    cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_0_b, cur),
-                    cur);
-
-            wstate.use_buf(ctx0, 0);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            // projection
-            cur = ggml_mul_mat(ctx0,
-                    layer.mlp_1_w,
-                    cur);
-
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_1_b, cur),
-                    cur);
-        }
-
-        wstate.use_buf(ctx0, 3);
-
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    cur = inpL;
-
-    // norm
-    {
-        wstate.use_buf(ctx0, 0);
-
-        cur = ggml_norm(ctx0, cur, hparams.eps);
-
-        wstate.use_buf(ctx0, 1);
-
-        cur = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.d_ln_w, cur),
-                    cur),
-                ggml_repeat(ctx0, model.d_ln_b, cur));
-    }
-
-    wstate.use_buf(ctx0, 0);
-
-    // compute logits only for the last token
-    // comment this line to compute logits for all N tokens
-    // might be useful in the future
-    cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
-
-    struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
-
-    wstate.use_buf(ctx0, -1);
-
-    // run the computation
-    {
-        ggml_build_forward_expand  (&gf, logits);
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-    }
-
-    // extract logits for all N tokens
-    //logits_out.resize(N*n_vocab);
-    //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
-
-    // extract logits only for the last token
-    logits_out.resize(n_vocab);
-    memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab);
-
-    if (N > 1) {
-        //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-        //        ggml_used_mem(ctx0)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
-    }
-
-    ggml_free(ctx0);
-
-    wstate.t_decode_us += ggml_time_us() - t_start_us;
-    wstate.n_decode++;
-
-    return true;
-}
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-static std::string to_timestamp(int64_t t, bool comma = false) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-#define SIN_COS_N_COUNT WHISPER_N_FFT
-static float sin_vals[SIN_COS_N_COUNT];
-static float cos_vals[SIN_COS_N_COUNT];
-
-// In FFT, we frequently use sine and cosine operations with the same values.
-// We can use precalculated values to speed up the process.
-static void fill_sin_cos_table() {
-    static bool is_filled = false;
-    if (is_filled) return;
-    for (int i = 0; i < SIN_COS_N_COUNT; i++) {
-        double theta = (2*M_PI*i)/SIN_COS_N_COUNT;
-        sin_vals[i] = sinf(theta);
-        cos_vals[i] = cosf(theta);
-    }
-    is_filled = true;
-}
-
-// naive Discrete Fourier Transform
-// input is real-valued
-// output is complex-valued
-static void dft(const std::vector<float> & in, std::vector<float> & out) {
-    int N = in.size();
-
-    out.resize(N*2);
-    const int sin_cos_step = SIN_COS_N_COUNT / N;
-
-    for (int k = 0; k < N; k++) {
-        float re = 0;
-        float im = 0;
-
-        for (int n = 0; n < N; n++) {
-            int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N
-            re += in[n]*cos_vals[idx]; // cos(t)
-            im -= in[n]*sin_vals[idx]; // sin(t)
-        }
-
-        out[k*2 + 0] = re;
-        out[k*2 + 1] = im;
-    }
-}
-
-// Cooley-Tukey FFT
-// poor man's implementation - use something better
-// input is real-valued
-// output is complex-valued
-static void fft(const std::vector<float> & in, std::vector<float> & out) {
-    out.resize(in.size()*2);
-
-    int N = in.size();
-
-    if (N == 1) {
-        out[0] = in[0];
-        out[1] = 0;
-        return;
-    }
-
-    if (N%2 == 1) {
-        dft(in, out);
-        return;
-    }
-
-    std::vector<float> even;
-    std::vector<float> odd;
-
-    even.reserve(N/2);
-    odd.reserve(N/2);
-
-    for (int i = 0; i < N; i++) {
-        if (i % 2 == 0) {
-            even.push_back(in[i]);
-        } else {
-            odd.push_back(in[i]);
-        }
-    }
-
-    std::vector<float> even_fft;
-    std::vector<float> odd_fft;
-
-    fft(even, even_fft);
-    fft(odd, odd_fft);
-
-    const int sin_cos_step = SIN_COS_N_COUNT / N;
-    for (int k = 0; k < N/2; k++) {
-        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
-        float re = cos_vals[idx]; // cos(t)
-        float im = -sin_vals[idx]; // sin(t)
-
-        float re_odd = odd_fft[2*k + 0];
-        float im_odd = odd_fft[2*k + 1];
-
-        out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
-        out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
-
-        out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
-        out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
-    }
-}
-
-static bool hann_window(int length, bool periodic, std::vector<float> & output) {
-    if (output.size() < static_cast<size_t>(length)) {
-        output.resize(length);
-    }
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        output[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(length + offset)));
-    }
-
-    return true;
-}
-
-static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> & hann, const std::vector<float> & samples,
-                                              int n_samples, int frame_size, int frame_step, int n_threads,
-                                              const whisper_filters & filters, whisper_mel & mel) {
-    std::vector<float> fft_in(frame_size, 0.0);
-    std::vector<float> fft_out(2 * frame_step);
-    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
-    int n_fft = 1 + (frame_size / 2);
-    int i = ith;
-
-    // calculate FFT only when fft_in are not all zero
-    for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
-        const int offset = i * frame_step;
-
-        // apply Hanning window (~10% faster)
-        for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
-            fft_in[j] = hann[j] * samples[offset + j];
-        }
-        // fill the rest with zeros
-        if (n_samples - offset < frame_size) {
-            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
-        }
-
-        // FFT
-        fft(fft_in, fft_out);
-
-        // Calculate modulus^2 of complex numbers
-        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
-        for (int j = 0; j < frame_size; j++) {
-            fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
-        }
-
-        // mel spectrogram
-        for (int j = 0; j < mel.n_mel; j++) {
-            double sum = 0.0;
-
-            // unroll loop (suggested by GH user @lunixbochs)
-            int k = 0;
-            for (k = 0; k < n_fft - 3; k += 4) {
-                sum +=
-                        fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
-                        fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
-                        fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
-                        fft_out[k + 3] * filters.data[j * n_fft + k + 3];
-            }
-
-            // handle n_fft remainder
-            for (; k < n_fft; k++) {
-                sum += fft_out[k] * filters.data[j * n_fft + k];
-            }
-
-            sum = log10(std::max(sum, 1e-10));
-
-            mel.data[j * mel.n_len + i] = sum;
-        }
-    }
-
-    // Otherwise fft_out are all zero
-    double sum = log10(1e-10);
-    for (; i < mel.n_len; i += n_threads) {
-        for (int j = 0; j < mel.n_mel; j++) {
-            mel.data[j * mel.n_len + i] = sum;
-        }
-    }
-}
-
-// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
-static bool log_mel_spectrogram(
-              whisper_state & wstate,
-              const float * samples,
-              const int   n_samples,
-              const int   /*sample_rate*/,
-              const int   frame_size,
-              const int   frame_step,
-              const int   n_mel,
-              const int   n_threads,
-              const whisper_filters & filters,
-              const bool   debug,
-              whisper_mel & mel) {
-    const int64_t t_start_us = ggml_time_us();
-
-    // Hanning window (Use cosf to eliminate difference)
-    // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
-    // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
-    std::vector<float> hann;
-    hann_window(frame_size, true, hann);
-
-
-    // Calculate the length of padding
-    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
-    int64_t stage_2_pad = frame_size / 2;
-
-    // Initialize a vector and copy data from C array to it.
-    std::vector<float> samples_padded;
-    samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
-    std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
-
-    // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
-    std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
-
-    // reflective pad 200 samples at the beginning of audio
-    std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
-
-    mel.n_mel     = n_mel;
-    // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
-    // Calculate number of frames + remove the last frame
-    mel.n_len     = (samples_padded.size() - frame_size) / frame_step;
-    // Calculate semi-padded sample length to ensure compatibility
-    mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
-    mel.data.resize(mel.n_mel * mel.n_len);
-
-
-    {
-        std::vector<std::thread> workers(n_threads - 1);
-        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw] = std::thread(
-                    log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples_padded,
-                    n_samples + stage_2_pad, frame_size, frame_step, n_threads,
-                    std::cref(filters), std::ref(mel));
-        }
-
-        // main thread
-        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
-
-        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw].join();
-        }
-    }
-
-    // clamping and normalization
-    double mmax = -1e20;
-    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-        if (mel.data[i] > mmax) {
-            mmax = mel.data[i];
-        }
-    }
-
-    mmax -= 8.0;
-
-    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-        if (mel.data[i] < mmax) {
-            mel.data[i] = mmax;
-        }
-
-        mel.data[i] = (mel.data[i] + 4.0)/4.0;
-    }
-
-    wstate.t_mel_us += ggml_time_us() - t_start_us;
-
-    // Dump log_mel_spectrogram
-    if (debug) {
-        std::ofstream outFile("log_mel_spectrogram.json");
-        outFile << "[";
-        for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
-            outFile << mel.data[i] << ", ";
-        }
-        outFile << mel.data[mel.data.size() - 1] << "]";
-        outFile.close();
-    }
-
-    return true;
-}
-
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<whisper_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.empty()) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            bool found = false;
-            while (j > i) {
-                auto sub = word.substr(i, j-i);
-                auto it = vocab.token_to_id.find(sub);
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    found = true;
-                    break;
-                }
-                --j;
-            }
-            if (!found) {
-                log("unknown token\n");
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-//
-// interface implementation
-//
-
-#ifdef WHISPER_USE_COREML
-// replace .bin with -encoder.mlmodelc
-static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    // match "-qx_x"
-    pos = path_bin.rfind('-');
-    if (pos != std::string::npos) {
-        auto sub = path_bin.substr(pos);
-        if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
-            path_bin = path_bin.substr(0, pos);
-        }
-    }
-
-    path_bin += "-encoder.mlmodelc";
-
-    return path_bin;
-}
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-// replace .bin with-encoder-openvino.xml
-static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += "-encoder-openvino.xml";
-
-    return path_bin;
-}
-
-static std::string whisper_openvino_get_path_cache(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += "-encoder-openvino-cache";
-
-    return path_bin;
-}
-#endif
-
-struct whisper_state * whisper_init_state(whisper_context * ctx) {
-    fill_sin_cos_table();
-    whisper_state * state = new whisper_state;
-
-    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
-
-    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
-        log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
-        delete state;
-        return nullptr;
-    }
-
-    {
-        const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
-        log("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
-    }
-
-    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
-        log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
-        delete state;
-        return nullptr;
-    }
-
-    {
-        const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
-        log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
-    }
-
-#ifdef WHISPER_USE_COREML
-    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
-
-    log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-    log("%s: first run on a device may take a while ...\n", __func__);
-
-    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
-    if (!state->ctx_coreml) {
-        log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
-#ifndef WHISPER_COREML_ALLOW_FALLBACK
-        return nullptr;
-#endif
-    } else {
-        log("%s: Core ML model loaded\n", __func__);
-    }
-#endif
-
-    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
-
-    state->logits_id.reserve(ctx->model.hparams.n_vocab);
-
-    // TAGS: WHISPER_DECODER_INIT
-    state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx);
-
-    state->decoders[0].probs.reserve(ctx->vocab.n_vocab);
-    state->decoders[0].logits.reserve(ctx->vocab.n_vocab);
-    state->decoders[0].logprobs.reserve(ctx->vocab.n_vocab);
-    state->buf_compute.resize(scale * std::max(MEM_REQ_ENCODE.at(ctx->model.type), MEM_REQ_DECODE.at(ctx->model.type)));
-
-    state->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-    state->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
-    state->buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(ctx->model.type));
-    state->buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(ctx->model.type));
-
-    state->rng = std::mt19937(0);
-
-    return state;
-}
-
-int whisper_ctx_init_openvino_encoder(
-        struct whisper_context * ctx,
-                    const char * model_path,
-                    const char * device,
-                    const char * cache_dir) {
-#ifndef WHISPER_USE_OPENVINO
-    (void)(ctx);
-    (void)(model_path);
-    (void)(device);
-    (void)(cache_dir);
-
-    return 1;
-#else
-    if (!model_path && ctx->path_model.empty()) {
-        log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
-        return 1;
-    }
-
-    std::string path_encoder;
-    if (!model_path) {
-        //if model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
-        path_encoder = whisper_openvino_get_path_encoder(ctx->path_model);
-    } else {
-        path_encoder = model_path;
-    }
-
-    std::string path_cache;
-    if (!cache_dir) {
-        //if cache_dir is not set, set it as a dir residing next to ggml-<model>.bin
-        path_cache = whisper_openvino_get_path_cache(ctx->path_model);
-    } else {
-        path_cache = cache_dir;
-    }
-
-    log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
-    log("%s: first run on a device may take a while ...\n", __func__);
-
-    ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
-    if (!ctx->state->ctx_openvino) {
-        log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
-        return 1;
-    } else {
-        log("%s: OpenVINO model loaded\n", __func__);
-    }
-
-    return 0;
-#endif
-}
-
-struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
-
-    log("%s: loading model from '%s'\n", __func__, path_model);
-
-    auto fin = std::ifstream(path_model, std::ios::binary);
-    if (!fin) {
-        log("%s: failed to open '%s'\n", __func__, path_model);
-        return nullptr;
-    }
-
-    whisper_model_loader loader = {};
-
-    loader.context = &fin;
-
-    loader.read = [](void * ctx, void * output, size_t read_size) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        fin->read((char *)output, read_size);
-        return read_size;
-    };
-
-    loader.eof = [](void * ctx) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        return fin->eof();
-    };
-
-    loader.close = [](void * ctx) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        fin->close();
-    };
-
-    auto ctx = whisper_init_no_state(&loader);
-
-    if (ctx) {
-        ctx->path_model = path_model;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
-    struct buf_context {
-        uint8_t* buffer;
-        size_t size;
-        size_t current_offset;
-    };
-
-    buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
-
-    log("%s: loading model from buffer\n", __func__);
-
-    whisper_model_loader loader = {};
-
-    loader.context = &ctx;
-
-    loader.read = [](void * ctx, void * output, size_t read_size) {
-        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
-
-        size_t size_to_copy = buf->current_offset + read_size < buf->size ? read_size : buf->size - buf->current_offset;
-
-        memcpy(output, buf->buffer + buf->current_offset, size_to_copy);
-        buf->current_offset += size_to_copy;
-
-        return size_to_copy;
-    };
-
-    loader.eof = [](void * ctx) {
-        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
-
-        return buf->current_offset >= buf->size;
-    };
-
-    loader.close = [](void * /*ctx*/) { };
-
-    return whisper_init_no_state(&loader);
-}
-
-struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
-    ggml_time_init();
-
-    whisper_context * ctx = new whisper_context;
-
-    if (!whisper_model_load(loader, *ctx)) {
-        loader->close(loader->context);
-        log("%s: failed to load model\n", __func__);
-        delete ctx;
-        return nullptr;
-    }
-
-    loader->close(loader->context);
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_file(const char * path_model) {
-    whisper_context * ctx = whisper_init_from_file_no_state(path_model);
-    if (!ctx) {
-        return nullptr;
-    }
-
-    ctx->state = whisper_init_state(ctx);
-    if (!ctx->state) {
-        whisper_free(ctx);
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
-    whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size);
-    if (!ctx) {
-        return nullptr;
-    }
-
-    ctx->state = whisper_init_state(ctx);
-    if (!ctx->state) {
-        whisper_free(ctx);
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
-    whisper_context * ctx = whisper_init_no_state(loader);
-    if (!ctx) {
-        return nullptr;
-    }
-
-    ctx->state = whisper_init_state(ctx);
-    if (!ctx->state) {
-        whisper_free(ctx);
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-void whisper_free_state(struct whisper_state * state)
-{
-    if (state) {
-        kv_cache_free(state->kv_cross);
-
-        for (int i = 0; i < WHISPER_MAX_DECODERS; ++i) {
-            kv_cache_free(state->decoders[i].kv_self);
-        }
-
-#ifdef WHISPER_USE_COREML
-        if (state->ctx_coreml != nullptr) {
-            whisper_coreml_free(state->ctx_coreml);
-            state->ctx_coreml = nullptr;
-        }
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-        if (state->ctx_openvino != nullptr) {
-            whisper_openvino_free(state->ctx_openvino);
-            state->ctx_openvino = nullptr;
-        }
-#endif
-
-        delete state;
-    }
-}
-
-void whisper_free(struct whisper_context * ctx) {
-    if (ctx) {
-        if (ctx->model.ctx) {
-            ggml_free(ctx->model.ctx);
-        }
-        if (ctx->model.buf) {
-            delete ctx->model.buf;
-        }
-
-        whisper_free_state(ctx->state);
-
-        delete ctx;
-    }
-}
-
-void whisper_free_params(struct whisper_full_params * params) {
-    if (params) {
-        delete params;
-    }
-}
-
-int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
-        log("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
-        log("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
-// TODO
-
-int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
-          struct whisper_state * state,
-                   const float * data,
-                           int   n_len,
-                           int   n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
-        return -1;
-    }
-
-    state->mel.n_len     = n_len;
-    state->mel.n_len_org = n_len;
-    state->mel.n_mel     = n_mel;
-
-    state->mel.data.resize(n_len*n_mel);
-    memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
-
-    return 0;
-}
-
-int whisper_set_mel(
-        struct whisper_context * ctx,
-        const float * data,
-        int n_len,
-        int n_mel) {
-    return whisper_set_mel_with_state(ctx, ctx->state, data, n_len, n_mel);
-}
-
-int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
-    if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
-        log("%s: failed to eval\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
-    if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
-        log("%s: failed to eval\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
-    const int selected_decoder_id = 0;
-
-    if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
-        log("%s: failed to eval\n", __func__);
-        return 1;
-    }
-
-    return 0;
-}
-
-int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
-    // TODO: add selected_decoder_id to state
-    const int selected_decoder_id = 0;
-
-    if (ctx->state == nullptr) {
-        log("%s: ERROR state was not loaded.\n", __func__);
-        return false;
-    }
-
-    if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
-        log("%s: failed to eval\n", __func__);
-        return 1;
-    }
-
-    return 0;
-}
-
-int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
-    const auto res = tokenize(ctx->vocab, text);
-
-    if (n_max_tokens < (int) res.size()) {
-        log("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
-        return -1;
-    }
-
-    for (int i = 0; i < (int) res.size(); i++) {
-        tokens[i] = res[i];
-    }
-
-    return res.size();
-}
-
-int whisper_lang_max_id() {
-    auto max_id = 0;
-    for (const auto & kv : g_lang) {
-        max_id = std::max(max_id, kv.second.first);
-    }
-
-    return max_id;
-}
-
-int whisper_lang_id(const char * lang) {
-    if (!g_lang.count(lang)) {
-        for (const auto & kv : g_lang) {
-            if (kv.second.second == lang) {
-                return kv.second.first;
-            }
-        }
-
-        log("%s: unknown language '%s'\n", __func__, lang);
-        return -1;
-    }
-    return g_lang.at(lang).first;
-}
-
-const char * whisper_lang_str(int id) {
-    for (const auto & kv : g_lang) {
-        if (kv.second.first == id) {
-            return kv.first.c_str();
-        }
-    }
-
-    log("%s: unknown language id %d\n", __func__, id);
-    return nullptr;
-}
-
-int whisper_lang_auto_detect_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                           int   offset_ms,
-                           int   n_threads,
-                         float * lang_probs) {
-    const int seek = offset_ms/10;
-
-    if (seek < 0) {
-        log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
-        return -1;
-    }
-
-    if (seek >= state->mel.n_len_org) {
-        log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
-        return -2;
-    }
-
-    // run the encoder
-    if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
-        log("%s: failed to encode\n", __func__);
-        return -6;
-    }
-
-    const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
-
-    if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
-        log("%s: failed to decode\n", __func__);
-        return -7;
-    }
-
-    auto & logits_id = state->logits_id;
-    logits_id.clear();
-
-    for (const auto & kv : g_lang) {
-        const auto token_lang = whisper_token_lang(ctx, kv.second.first);
-        logits_id.emplace_back(state->logits[token_lang], kv.second.first);
-    }
-
-    // sort descending
-    {
-        using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
-        std::sort(logits_id.begin(), logits_id.end(), [](const pair_type & a, const pair_type & b) {
-            return a.first > b.first;
-        });
-    }
-
-    // softmax
-    {
-        const auto max = logits_id[0].first;
-
-        double sum = 0.0f;
-        for (auto & kv : logits_id) {
-            kv.first = exp(kv.first - max);
-            sum += kv.first;
-        }
-
-        for (auto & kv : logits_id) {
-            kv.first /= sum;
-        }
-    }
-
-    {
-        for (const auto & prob : logits_id) {
-            if (lang_probs) {
-                lang_probs[prob.second] = prob.first;
-            }
-
-            //printf("%s: lang %2d (%3s): %f\n", __func__, prob.second, whisper_lang_str(prob.second), prob.first);
-        }
-    }
-
-    return logits_id[0].second;
-}
-
-int whisper_lang_auto_detect(
-        struct whisper_context * ctx,
-                           int   offset_ms,
-                           int   n_threads,
-                         float * lang_probs) {
-    return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
-}
-
-int whisper_model_n_vocab(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_vocab;
-}
-
-int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_ctx;
-}
-
-int whisper_model_n_audio_state(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_state;
-}
-
-int whisper_model_n_audio_head(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_head;
-}
-
-int whisper_model_n_audio_layer(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_layer;
-}
-
-int whisper_model_n_text_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_ctx;
-}
-
-int whisper_model_n_text_state(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_state;
-}
-
-int whisper_model_n_text_head(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_head;
-}
-
-int whisper_model_n_text_layer(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_layer;
-}
-
-int whisper_model_n_mels(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_mels;
-}
-
-int whisper_model_ftype(struct whisper_context * ctx) {
-    return ctx->model.hparams.ftype;
-}
-
-int whisper_model_type(struct whisper_context * ctx) {
-    return ctx->model.type;
-}
-
-const char *whisper_model_type_readable(struct whisper_context * ctx) {
-    switch (ctx->model.type) {
-    case e_model::MODEL_TINY:
-        return "tiny";
-    case e_model::MODEL_BASE:
-        return "base";
-    case e_model::MODEL_SMALL:
-        return "small";
-    case e_model::MODEL_MEDIUM:
-        return "medium";
-    case e_model::MODEL_LARGE:
-        return "large";
-    default:
-        return "unknown";
-    }
-}
-
-int whisper_n_len_from_state(struct whisper_state * state) {
-    return state->mel.n_len_org;
-}
-
-int whisper_n_len(struct whisper_context * ctx) {
-    return ctx->state->mel.n_len_org;
-}
-
-int whisper_n_vocab(struct whisper_context * ctx) {
-    return ctx->vocab.n_vocab;
-}
-
-int whisper_n_text_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_ctx;
-}
-
-int whisper_n_audio_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_ctx;
-}
-
-int whisper_is_multilingual(struct whisper_context * ctx) {
-    return ctx->vocab.is_multilingual() ? 1 : 0;
-}
-
-float * whisper_get_logits(struct whisper_context * ctx) {
-    return ctx->state->logits.data();
-}
-
-float * whisper_get_logits_from_state(struct whisper_state * state) {
-    return state->logits.data();
-}
-
-const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token) {
-    return ctx->vocab.id_to_token.at(token).c_str();
-}
-
-whisper_token whisper_token_eot(struct whisper_context * ctx) {
-    return ctx->vocab.token_eot;
-}
-
-whisper_token whisper_token_sot(struct whisper_context * ctx) {
-    return ctx->vocab.token_sot;
-}
-
-whisper_token whisper_token_solm(struct whisper_context * ctx) {
-    return ctx->vocab.token_solm;
-}
-
-whisper_token whisper_token_prev(struct whisper_context * ctx) {
-    return ctx->vocab.token_prev;
-}
-
-whisper_token whisper_token_nosp(struct whisper_context * ctx) {
-    return ctx->vocab.token_nosp;
-}
-
-whisper_token whisper_token_not(struct whisper_context * ctx) {
-    return ctx->vocab.token_not;
-}
-
-whisper_token whisper_token_beg(struct whisper_context * ctx) {
-    return ctx->vocab.token_beg;
-}
-
-whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
-    return whisper_token_sot(ctx) + 1 + lang_id;
-}
-
-whisper_token whisper_token_translate(struct whisper_context * ctx) {
-    return ctx->vocab.token_translate;
-}
-
-whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
-    return ctx->vocab.token_transcribe;
-}
-
-void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
-
-    log("\n");
-    log("%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
-    if (ctx->state != nullptr) {
-
-        const int32_t n_sample = std::max(1, ctx->state->n_sample);
-        const int32_t n_encode = std::max(1, ctx->state->n_encode);
-        const int32_t n_decode = std::max(1, ctx->state->n_decode);
-
-        log("%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
-        log("%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
-        log("%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
-        log("%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
-        log("%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
-    }
-    log("%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
-}
-
-void whisper_reset_timings(struct whisper_context * ctx) {
-    if (ctx->state != nullptr) {
-        ctx->state->t_sample_us = 0;
-        ctx->state->t_encode_us = 0;
-        ctx->state->t_decode_us = 0;
-    }
-}
-
-static int whisper_has_coreml(void) {
-#ifdef WHISPER_USE_COREML
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-static int whisper_has_openvino(void) {
-#ifdef WHISPER_USE_OPENVINO
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-const char * whisper_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "SSSE3 = "     + std::to_string(ggml_cpu_has_ssse3())     + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
-    s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";
-    s += "OPENVINO = "  + std::to_string(whisper_has_openvino())   + " | ";
-
-    return s.c_str();
-}
-
-////////////////////////////////////////////////////////////////////////////
-
-struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) {
-    struct whisper_full_params params = whisper_full_default_params(strategy);
-
-    struct whisper_full_params* result = new whisper_full_params();
-    *result = params;
-    return result;
-}
-
-struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
-    struct whisper_full_params result = {
-        /*.strategy          =*/ strategy,
-
-        /*.n_threads         =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
-        /*.n_max_text_ctx    =*/ 16384,
-        /*.offset_ms         =*/ 0,
-        /*.duration_ms       =*/ 0,
-
-        /*.translate         =*/ false,
-        /*.no_context        =*/ true,
-        /*.single_segment    =*/ false,
-        /*.print_special     =*/ false,
-        /*.print_progress    =*/ true,
-        /*.print_realtime    =*/ false,
-        /*.print_timestamps  =*/ true,
-
-        /*.token_timestamps  =*/ false,
-        /*.thold_pt          =*/ 0.01f,
-        /*.thold_ptsum       =*/ 0.01f,
-        /*.max_len           =*/ 0,
-        /*.split_on_word     =*/ false,
-        /*.max_tokens        =*/ 0,
-
-        /*.speed_up          =*/ false,
-        /*.debug_mode        =*/ false,
-        /*.audio_ctx         =*/ 0,
-
-        /*.tdrz_enable       =*/ false,
-
-        /*.initial_prompt    =*/ nullptr,
-        /*.prompt_tokens     =*/ nullptr,
-        /*.prompt_n_tokens   =*/ 0,
-
-        /*.language          =*/ "en",
-        /*.detect_language   =*/ false,
-
-        /*.suppress_blank    =*/ true,
-        /*.suppress_non_speech_tokens =*/ false,
-
-        /*.temperature       =*/  0.0f,
-        /*.max_initial_ts    =*/  1.0f,
-        /*.length_penalty    =*/ -1.0f,
-
-        /*.temperature_inc   =*/  0.4f,
-        /*.entropy_thold     =*/  2.4f,
-        /*.logprob_thold     =*/ -1.0f,
-        /*.no_speech_thold   =*/  0.6f,
-
-        /*.greedy            =*/ {
-            /*.best_of   =*/ -1,
-        },
-
-        /*.beam_search      =*/ {
-            /*.beam_size =*/ -1,
-
-            /*.patience  =*/ -1.0f,
-        },
-
-        /*.new_segment_callback           =*/ nullptr,
-        /*.new_segment_callback_user_data =*/ nullptr,
-
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-
-        /*.encoder_begin_callback           =*/ nullptr,
-        /*.encoder_begin_callback_user_data =*/ nullptr,
-
-        /*.logits_filter_callback           =*/ nullptr,
-        /*.logits_filter_callback_user_data =*/ nullptr,
-    };
-
-    switch (strategy) {
-        case WHISPER_SAMPLING_GREEDY:
-            {
-                result.greedy = {
-                    /*.best_of   =*/ 2, // TODO: increase to 5 when we speed-up batch decoding
-                };
-            } break;
-        case WHISPER_SAMPLING_BEAM_SEARCH:
-            {
-                result.beam_search = {
-                    /*.beam_size =*/ 2, // TODO: increase to 5 when we speed-up batch decoding
-
-                    /*.patience  =*/ -1.0f,
-                };
-            } break;
-    }
-
-    return result;
-}
-
-// forward declarations
-static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window);
-static void whisper_exp_compute_token_level_timestamps(
-        struct whisper_context & ctx,
-          struct whisper_state & state,
-                           int   i_segment,
-                         float   thold_pt,
-                         float   thold_ptsum);
-
-static inline bool should_split_on_word(const char * txt, bool split_on_word) {
-    if (!split_on_word) return true;
-
-    return txt[0] == ' ';
-}
-
-// wrap the last segment to max_len characters
-// returns the number of new segments
-static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_state & state, int max_len, bool split_on_word) {
-    auto segment = state.result_all.back();
-
-    int res = 1;
-    int acc = 0;
-
-    std::string text;
-
-    for (int i = 0; i < (int) segment.tokens.size(); i++) {
-        const auto & token = segment.tokens[i];
-        if (token.id >= whisper_token_eot(&ctx)) {
-            continue;
-        }
-
-        const auto txt = whisper_token_to_str(&ctx, token.id);
-        const int cur = strlen(txt);
-
-        if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
-            state.result_all.back().text = std::move(text);
-            state.result_all.back().t1 = token.t0;
-            state.result_all.back().tokens.resize(i);
-            state.result_all.back().speaker_turn_next = false;
-
-            state.result_all.push_back({});
-            state.result_all.back().t0 = token.t0;
-            state.result_all.back().t1 = segment.t1;
-
-            // add tokens [i, end] to the new segment
-            state.result_all.back().tokens.insert(
-                state.result_all.back().tokens.end(),
-                    segment.tokens.begin() + i,
-                    segment.tokens.end());
-
-            state.result_all.back().speaker_turn_next = segment.speaker_turn_next;
-
-            acc = 0;
-            text = "";
-
-            segment = state.result_all.back();
-            i = -1;
-
-            res++;
-        } else {
-            acc += cur;
-            text += txt;
-        }
-    }
-
-    state.result_all.back().text = std::move(text);
-
-    return res;
-}
-
-static const std::vector<std::string> non_speech_tokens = {
-    "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
-    "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
-    "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
-    "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
-};
-
-// process the logits for the selected decoder
-// - applies logit filters
-// - computes logprobs and probs
-static void whisper_process_logits(
-              struct whisper_context & ctx,
-               struct whisper_state  & state,
-    const struct whisper_full_params   params,
-              struct whisper_decoder & decoder,
-                               float   temperature) {
-    const auto & vocab      = ctx.vocab;
-    const auto & tokens_cur = decoder.sequence.tokens;
-
-    const bool is_initial = tokens_cur.size() == 0;
-    const int  n_logits   = vocab.id_to_token.size();
-
-    WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab);
-
-    // extract the logits for the last token
-    // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly
-    auto & probs    = decoder.probs;
-    auto & logits   = decoder.logits;
-    auto & logprobs = decoder.logprobs;
-    {
-        logits.resize(n_logits);
-        memcpy(logits.data(), state.logits.data() + (state.logits.size() - n_logits), n_logits*sizeof(float));
-
-        if (temperature > 0.0f) {
-            for (int i = 0; i < n_logits; i++) {
-                logits[i] /= temperature;
-            }
-        }
-
-        // will be populated a bit later
-        probs.resize(n_logits);
-        logprobs.resize(n_logits);
-    }
-
-    // apply logit filters here
-    // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L480-L493
-    {
-        // suppress blank
-        // https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L388-L390
-        if (params.suppress_blank) {
-            if (is_initial) {
-                logits[vocab.token_eot]           = -INFINITY;
-                logits[vocab.token_to_id.at(" ")] = -INFINITY;
-            }
-        }
-
-        // suppress <|notimestamps|> token
-        // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
-        logits[vocab.token_not] = -INFINITY;
-
-        // suppress sot and nosp tokens
-        logits[vocab.token_sot]  = -INFINITY;
-        logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
-
-        // [TDRZ] when tinydiarize is disabled, suppress solm token
-        if (params.tdrz_enable == false) {
-            logits[vocab.token_solm] = -INFINITY;
-        }
-
-        // suppress task tokens
-        logits[vocab.token_translate]  = -INFINITY;
-        logits[vocab.token_transcribe] = -INFINITY;
-
-        if (params.logits_filter_callback) {
-            params.logits_filter_callback(&ctx, &state, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
-        }
-
-        // suppress non-speech tokens
-        // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-        if (params.suppress_non_speech_tokens) {
-            for (const std::string & token : non_speech_tokens) {
-                const std::string suppress_tokens[] = {token, " " + token};
-                for (const std::string & suppress_token : suppress_tokens) {
-                    if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
-                        logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
-                    }
-                }
-            }
-
-            // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
-            if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
-                logits[vocab.token_to_id.at(" -")] = -INFINITY;
-            }
-            if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
-                logits[vocab.token_to_id.at(" '")] = -INFINITY;
-            }
-        }
-
-        // timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
-        // https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424
-        {
-            const bool last_was_timestamp        = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
-            const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
-
-            //log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
-
-            if (last_was_timestamp) {
-                if (penultimate_was_timestamp) {
-                    for (int i = vocab.token_beg; i < n_logits; ++i) {
-                        logits[i] = -INFINITY;
-                    }
-                } else {
-                    for (int i = 0; i < vocab.token_eot; ++i) {
-                        logits[i] = -INFINITY;
-                    }
-                }
-            }
-        }
-
-        // the initial timestamp cannot be larger than max_initial_ts
-        // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429
-        if (is_initial && params.max_initial_ts > 0.0f) {
-            const float precision = float(WHISPER_CHUNK_SIZE)/ctx.model.hparams.n_audio_ctx;
-            const int   tid0      = std::round(params.max_initial_ts/precision);
-
-            for (int i = vocab.token_beg + tid0 + 1; i < n_logits; ++i) {
-                logits[i] = -INFINITY;
-            }
-        }
-
-        // condition timestamp tokens to be increasing
-        // ref: https://github.com/openai/whisper/pull/831#issuecomment-1385910556
-        if (decoder.has_ts) {
-            const int tid0 = decoder.seek_delta/2;
-
-            for (int i = vocab.token_beg; i < vocab.token_beg + tid0; ++i) {
-                logits[i] = -INFINITY;
-            }
-        }
-
-        // populate the logprobs array (log_softmax)
-        {
-            const float logit_max = *std::max_element(logits.begin(), logits.end());
-            float logsumexp = 0.0f;
-            for (int i = 0; i < n_logits; ++i) {
-                if (logits[i] > -INFINITY) {
-                    logsumexp += expf(logits[i] - logit_max);
-                }
-            }
-            logsumexp = logf(logsumexp) + logit_max;
-
-            for (int i = 0; i < n_logits; ++i) {
-                if (logits[i] > -INFINITY) {
-                    logprobs[i] = logits[i] - logsumexp;
-                } else {
-                    logprobs[i] = -INFINITY;
-                }
-            }
-        }
-
-        // if sum of probability over timestamps is above any other token, sample timestamp
-        // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L431-L437
-        {
-            // logsumexp over timestamps
-            float timestamp_logprob = -INFINITY;
-            {
-                float logsumexp = 0.0f;
-                const float logprob_max = *std::max_element(logprobs.begin() + vocab.token_beg, logprobs.end());
-                for (int i = vocab.token_beg; i < n_logits; ++i) {
-                    if (logprobs[i] > -INFINITY) {
-                        logsumexp += expf(logprobs[i] - logprob_max);
-                    }
-                }
-                if (logsumexp > 0.0f) {
-                    timestamp_logprob = logf(logsumexp) + logprob_max;
-                }
-            }
-
-            const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
-
-            //log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
-
-            if (timestamp_logprob > max_text_token_logprob) {
-                for (int i = 0; i < vocab.token_beg; ++i) {
-                    logits[i]   = -INFINITY;
-                    logprobs[i] = -INFINITY;
-                }
-            }
-        }
-    }
-
-    // compute probs
-    {
-        for (int i = 0; i < n_logits; ++i) {
-            if (logits[i] == -INFINITY) {
-                probs[i] = 0.0f;
-            } else {
-                probs[i] = expf(logprobs[i]);
-            }
-        }
-    }
-
-#if 0
-    // print first 100 logits - token string : logit
-    for (int i = 0; i < 100; i++) {
-        const auto token   = vocab.id_to_token.at(i);
-        const auto prob    = probs[i];
-        const auto logit   = logits[i];
-        const auto logprob = logprobs[i];
-        printf("%s : prob=%9.5f logit=%9.5f logprob=%9.5f\n", token.c_str(), prob, logit, logprob);
-    }
-
-    // "And", "and", " And", " and"
-    printf("logits[\"and\"]  = %f\n", logits[vocab.token_to_id.at("and")]);
-    printf("logits[\"And\"]  = %f\n", logits[vocab.token_to_id.at("And")]);
-    printf("logits[\" and\"] = %f\n", logits[vocab.token_to_id.at(" and")]);
-    printf("logits[\" And\"] = %f\n", logits[vocab.token_to_id.at(" And")]);
-    printf("logits[\" so\"]  = %f\n", logits[vocab.token_to_id.at(" so")]);
-
-    printf("logprobs[\"and\"]  = %f\n", logprobs[vocab.token_to_id.at("and")]);
-    printf("logprobs[\"And\"]  = %f\n", logprobs[vocab.token_to_id.at("And")]);
-    printf("logprobs[\" and\"] = %f\n", logprobs[vocab.token_to_id.at(" and")]);
-    printf("logprobs[\" And\"] = %f\n", logprobs[vocab.token_to_id.at(" And")]);
-    printf("logprobs[\" so\"]  = %f\n", logprobs[vocab.token_to_id.at(" so")]);
-
-    printf("probs[\"and\"]  = %f\n", probs[vocab.token_to_id.at("and")]);
-    printf("probs[\"And\"]  = %f\n", probs[vocab.token_to_id.at("And")]);
-    printf("probs[\" and\"] = %f\n", probs[vocab.token_to_id.at(" and")]);
-    printf("probs[\" And\"] = %f\n", probs[vocab.token_to_id.at(" And")]);
-    printf("probs[\" so\"]  = %f\n", probs[vocab.token_to_id.at(" so")]);
-#endif
-}
-
-static whisper_token_data whisper_sample_token(
-            whisper_context & ctx,
-              whisper_state & state,
-      const whisper_decoder & decoder,
-                       bool   best) {
-    whisper_token_data result = {
-        0, 0, 0.0f, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
-    };
-
-    const auto & vocab = ctx.vocab;
-
-    const auto & probs    = decoder.probs;
-    const auto & logprobs = decoder.logprobs;
-
-    const int n_logits = vocab.n_vocab;
-
-    {
-        double sum_ts = 0.0;
-        double max_ts = 0.0;
-
-        for (int i = vocab.token_beg; i < n_logits; i++) {
-            if (probs[i] == -INFINITY) {
-                continue;
-            }
-
-            sum_ts += probs[i];
-            if (max_ts < probs[i]) {
-                max_ts = probs[i];
-                result.tid = i;
-            }
-        }
-
-        result.pt    = max_ts/(sum_ts + 1e-10);
-        result.ptsum = sum_ts;
-    }
-
-    if (best) {
-        for (int i = 0; i < n_logits; ++i) {
-            if (result.p < probs[i]) {
-                result.id   = i;
-                result.p    = probs[i];
-                result.plog = logprobs[i];
-            }
-        }
-    } else {
-        std::discrete_distribution<> dist(probs.begin(), probs.end());
-
-        result.id   = dist(state.rng);
-        result.p    = probs[result.id];
-        result.plog = logprobs[result.id];
-    }
-
-    if (result.id >= vocab.token_beg) {
-        result.tid = result.id;
-        result.pt  = result.p;
-    }
-
-    state.n_sample++;
-
-    return result;
-}
-
-static std::vector<whisper_token_data> whisper_sample_token_topk(
-            whisper_context & ctx,
-              whisper_state & state,
-      const whisper_decoder & decoder,
-                        int   k) {
-    const auto & vocab = ctx.vocab;
-
-    const auto & probs    = decoder.probs;
-    const auto & logits   = decoder.logits;
-    const auto & logprobs = decoder.logprobs;
-
-    const int n_logits = vocab.n_vocab;
-
-    auto & logits_id = state.logits_id;
-
-    logits_id.clear();
-    for (int i = 0; i < n_logits; ++i) {
-        logits_id.push_back({ logits[i], i });
-    }
-
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + k, logits_id.end(),
-            [](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
-                return a.first > b.first;
-            });
-
-    std::vector<whisper_token_data> result;
-    result.reserve(k);
-
-    whisper_token tid = vocab.token_beg;
-
-    float pt    = 0.0;
-    float ptsum = 0.0;
-
-    {
-        double sum_ts = 0.0;
-        double max_ts = 0.0;
-
-        for (int i = vocab.token_beg; i < n_logits; i++) {
-            if (probs[i] == -INFINITY) {
-                continue;
-            }
-
-            sum_ts += probs[i];
-            if (max_ts < probs[i]) {
-                max_ts = probs[i];
-                tid = i;
-            }
-        }
-
-        pt    = max_ts/(sum_ts + 1e-10);
-        ptsum = sum_ts;
-    }
-
-    for (int i = 0; i < k; ++i) {
-        const auto id = logits_id[i].second;
-
-        result.push_back({ id, tid, probs[id], logprobs[id], pt, ptsum, -1, -1, 0.0f, });
-
-        if (result[i].id >= vocab.token_beg) {
-            result[i].tid = result[i].id;
-            result[i].pt  = result[i].p;
-        }
-    }
-
-    state.n_sample++;
-
-    return result;
-}
-
-// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L178-L192
-static void whisper_sequence_score(
-        const struct whisper_full_params & params,
-                        whisper_sequence & sequence) {
-    if (sequence.result_len == 0) {
-        return;
-    }
-
-    double result = 0.0f;
-
-    for (int i = 0; i < sequence.result_len; ++i) {
-        result += sequence.tokens[i].plog;
-    }
-
-    sequence.sum_logprobs = result;
-    sequence.avg_logprobs = result/sequence.result_len;
-
-    double penalty = sequence.result_len;
-
-    if (params.length_penalty > 0.0f) {
-        penalty = pow((5.0 + penalty)/6.0, params.length_penalty);
-    }
-
-    sequence.score = result/penalty;
-
-    // compute the entropy of the sequence of the last 32 tokens
-    {
-        const int n = 32;
-
-        int cnt = 0;
-        double entropy = 0.0f;
-
-        std::map<whisper_token, int> token_counts;
-        for (int i = std::max(0, sequence.result_len - n); i < sequence.result_len; ++i) {
-            token_counts[sequence.tokens[i].id]++;
-            cnt++;
-        }
-
-        for (const auto & kv : token_counts) {
-            const auto p = kv.second/(double)cnt;
-            entropy -= p*log(p);
-
-            //WHISPER_PRINT_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
-        }
-
-        sequence.entropy = entropy;
-    }
-}
-
-int whisper_full_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-    struct whisper_full_params   params,
-                   const float * samples,
-                           int   n_samples) {
-    // clear old results
-    auto & result_all = state->result_all;
-
-    result_all.clear();
-
-    if (n_samples > 0) {
-        // compute log mel spectrogram
-        if (params.speed_up) {
-            // TODO: Replace PV with more advanced algorithm
-            log("%s: failed to compute log mel spectrogram\n", __func__);
-            return -1;
-        } else {
-            if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-                log("%s: failed to compute log mel spectrogram\n", __func__);
-                return -2;
-            }
-        }
-    }
-
-    // auto-detect language if not specified
-    if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) {
-        std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
-
-        const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
-        if (lang_id < 0) {
-            log("%s: failed to auto-detect language\n", __func__);
-            return -3;
-        }
-        state->lang_id = lang_id;
-        params.language = whisper_lang_str(lang_id);
-
-        log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
-        if (params.detect_language) {
-            return 0;
-        }
-    }
-
-    if (params.token_timestamps) {
-        state->t_beg    = 0;
-        state->t_last   = 0;
-        state->tid_last = 0;
-        if (n_samples > 0) {
-            state->energy = get_signal_energy(samples, n_samples, 32);
-        }
-    }
-
-    const int seek_start = params.offset_ms/10;
-    const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
-
-    // if length of spectrogram is less than 1.0s (100 frames), then return
-    // basically don't process anything that is less than 1.0s
-    // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
-    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
-        return 0;
-    }
-
-    // a set of temperatures to use
-    // [ t0, t0 + delta, t0 + 2*delta, ..., < 1.0f + 1e-6f ]
-    std::vector<float> temperatures;
-    if (params.temperature_inc > 0.0f) {
-        for (float t = params.temperature; t < 1.0f + 1e-6f; t += params.temperature_inc) {
-            temperatures.push_back(t);
-        }
-    } else {
-        temperatures.push_back(params.temperature);
-    }
-
-    // initialize the decoders
-    int n_decoders = 1;
-
-    switch (params.strategy) {
-        case WHISPER_SAMPLING_GREEDY:
-            {
-                n_decoders = params.greedy.best_of;
-            } break;
-        case WHISPER_SAMPLING_BEAM_SEARCH:
-            {
-                n_decoders = std::max(params.greedy.best_of, params.beam_search.beam_size);
-            } break;
-    };
-
-    n_decoders = std::max(1, n_decoders);
-
-    // TAGS: WHISPER_DECODER_INIT
-    for (int j = 1; j < n_decoders; j++) {
-        auto & decoder = state->decoders[j];
-
-        if (decoder.kv_self.ctx == nullptr) {
-            decoder.kv_self = state->decoders[0].kv_self;
-            if (!kv_cache_reinit(decoder.kv_self)) {
-                log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
-                return -4;
-            }
-
-            WHISPER_PRINT_DEBUG("%s: initialized self-attention kv cache, decoder %d\n", __func__, j);
-
-            decoder.sequence.tokens.reserve(state->decoders[0].sequence.tokens.capacity());
-
-            decoder.probs.resize   (ctx->vocab.n_vocab);
-            decoder.logits.resize  (ctx->vocab.n_vocab);
-            decoder.logprobs.resize(ctx->vocab.n_vocab);
-        }
-    }
-
-    // the accumulated text context so far
-    auto & prompt_past = state->prompt_past;
-    if (params.no_context) {
-        prompt_past.clear();
-    }
-
-    // prepare prompt
-    {
-        std::vector<whisper_token> prompt_tokens;
-
-        // initial prompt
-        if (!params.prompt_tokens && params.initial_prompt) {
-            prompt_tokens.resize(1024);
-            prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
-            params.prompt_tokens   = prompt_tokens.data();
-            params.prompt_n_tokens = prompt_tokens.size();
-        }
-
-        // prepend the prompt tokens to the prompt_past
-        if (params.prompt_tokens && params.prompt_n_tokens > 0) {
-            // parse tokens from the pointer
-            for (int i = 0; i < params.prompt_n_tokens; i++) {
-                prompt_past.push_back(params.prompt_tokens[i]);
-            }
-            std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
-        }
-    }
-
-    // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
-    if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
-        log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
-        return -5;
-    }
-    state->exp_n_audio_ctx = params.audio_ctx;
-
-    // these tokens determine the task that will be performed
-    std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
-    if (whisper_is_multilingual(ctx)) {
-        const int lang_id = whisper_lang_id(params.language);
-        state->lang_id = lang_id;
-        prompt_init.push_back(whisper_token_lang(ctx, lang_id));
-        if (params.translate) {
-            prompt_init.push_back(whisper_token_translate(ctx));
-        } else {
-            prompt_init.push_back(whisper_token_transcribe(ctx));
-        }
-    }
-
-    int seek = seek_start;
-
-    std::vector<whisper_token> prompt;
-    prompt.reserve(whisper_n_text_ctx(ctx));
-
-    // beam-search helpers
-    struct kv_buf {
-        std::vector<uint8_t> k;
-        std::vector<uint8_t> v;
-    };
-
-    std::vector<kv_buf> kv_bufs;
-
-    struct beam_candidate {
-        int decoder_idx;
-        int seek_delta;
-
-        bool has_ts;
-
-        whisper_sequence sequence;
-    };
-
-    std::vector<beam_candidate> beam_candidates;
-
-    // main loop
-    while (true) {
-        if (params.progress_callback) {
-            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
-
-            params.progress_callback(
-                ctx, ctx->state, progress_cur, params.progress_callback_user_data);
-        }
-
-        // of only 1 second left, then stop
-        if (seek + 100 >= seek_end) {
-            break;
-        }
-
-        if (params.encoder_begin_callback) {
-            if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
-                log("%s: encoder_begin_callback returned false - aborting\n", __func__);
-                break;
-            }
-        }
-
-        // encode audio features starting at offset seek
-        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
-            log("%s: failed to encode\n", __func__);
-            return -6;
-        }
-
-        // if there is a very short audio segment left to process, we remove any past prompt since it tends
-        // to confuse the decoder and often make it repeat or hallucinate stuff
-        if (seek > seek_start && seek + 500 >= seek_end) {
-            prompt_past.clear();
-        }
-
-        int best_decoder_id = 0;
-
-        for (int it = 0; it < (int) temperatures.size(); ++it) {
-            const float t_cur = temperatures[it];
-
-            int n_decoders_cur = 1;
-
-            switch (params.strategy) {
-                case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
-                    {
-                        if (t_cur > 0.0f) {
-                            n_decoders_cur = params.greedy.best_of;
-                        }
-                    } break;
-                case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
-                    {
-                        if (t_cur > 0.0f) {
-                            n_decoders_cur = params.greedy.best_of;
-                        } else {
-                            n_decoders_cur = params.beam_search.beam_size;
-                        }
-                    } break;
-            };
-
-            n_decoders_cur = std::max(1, n_decoders_cur);
-
-            WHISPER_PRINT_DEBUG("\n%s: decoding with %d decoders, temperature = %.2f\n", __func__, n_decoders_cur, t_cur);
-
-            // TAGS: WHISPER_DECODER_INIT
-            for (int j = 0; j < n_decoders_cur; ++j) {
-                auto & decoder = state->decoders[j];
-
-                decoder.kv_self.n = 0;
-
-                decoder.sequence.tokens.clear();
-                decoder.sequence.result_len       = 0;
-                decoder.sequence.sum_logprobs_all = 0.0;
-                decoder.sequence.sum_logprobs     = -INFINITY;
-                decoder.sequence.avg_logprobs     = -INFINITY;
-                decoder.sequence.entropy          = 0.0;
-                decoder.sequence.score            = -INFINITY;
-
-                decoder.seek_delta = 100*WHISPER_CHUNK_SIZE;
-
-                decoder.failed    = false;
-                decoder.completed = false;
-                decoder.has_ts    = false;
-            }
-
-            // init prompt and kv cache for the current iteration
-            // run whisper_decoder() only for decoder 0 and copy the results for the other decoders
-            {
-                prompt.clear();
-
-                // if we have already generated some text, use it as a prompt to condition the next generation
-                if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
-                    int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
-
-                    prompt = { whisper_token_prev(ctx) };
-                    prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
-                }
-
-                // init new transcription with sot, language (opt) and task tokens
-                prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
-
-                // print the prompt
-                WHISPER_PRINT_DEBUG("\n\n");
-                for (int i = 0; i < (int) prompt.size(); i++) {
-                    WHISPER_PRINT_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
-                }
-                WHISPER_PRINT_DEBUG("\n\n");
-
-                if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
-                    log("%s: failed to decode\n", __func__);
-                    return -7;
-                }
-
-                {
-                    const int64_t t_start_sample_us = ggml_time_us();
-
-                    whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur);
-
-                    state->decoders[0].kv_self.n += prompt.size();
-
-                    for (int j = 1; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
-                        memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
-
-                        decoder.kv_self.n += prompt.size();
-
-                        memcpy(decoder.probs.data(), state->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
-                        memcpy(decoder.logits.data(), state->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
-                        memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
-                    }
-
-                    state->t_sample_us += ggml_time_us() - t_start_sample_us;
-                }
-            }
-
-            for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                // store the KV caches of all decoders when doing beam-search
-                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
-                    kv_bufs.resize(n_decoders_cur);
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-
-                        kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
-                        kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
-
-                        memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
-                        memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
-                    }
-
-                    beam_candidates.clear();
-                }
-
-                // generate new sequence candidates for each decoder
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.completed || decoder.failed) {
-                        continue;
-                    }
-
-                    switch (params.strategy) {
-                        case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
-                            {
-                                if (t_cur < 1e-6f) {
-                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, *state, decoder, true));
-                                } else {
-                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, *state, decoder, false));
-                                }
-
-                                decoder.sequence.sum_logprobs_all += decoder.sequence.tokens.back().plog;
-                            } break;
-                        case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
-                            {
-                                const auto tokens_new = whisper_sample_token_topk(*ctx, *state, decoder, params.beam_search.beam_size);
-
-                                for (const auto & token : tokens_new) {
-                                    beam_candidates.push_back({ j, decoder.seek_delta, decoder.has_ts, decoder.sequence });
-                                    beam_candidates.back().sequence.tokens.push_back(token);
-                                    beam_candidates.back().sequence.sum_logprobs_all += token.plog;
-
-                                    //WHISPER_PRINT_DEBUG("%s: beam candidate: %s (%f, %f)\n", __func__, ctx->vocab.id_to_token.at(token.id).c_str(), token.plog, beam_candidates.back().sequence.sum_logprobs_all);
-                                }
-                            } break;
-                    };
-                }
-
-                // for beam-search, choose the top candidates and update the KV caches
-                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
-                    std::sort(
-                            beam_candidates.begin(),
-                            beam_candidates.end(),
-                            [](const beam_candidate & a, const beam_candidate & b) {
-                        return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
-                    });
-
-                    uint32_t cur_c = 0;
-
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-
-                        auto & cur = beam_candidates[cur_c++];
-
-                        while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
-                            ++cur_c;
-                        }
-
-                        decoder.sequence   = cur.sequence;
-                        decoder.seek_delta = cur.seek_delta;
-                        decoder.has_ts     = cur.has_ts;
-
-                        memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
-                        memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
-
-                        WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
-                                __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
-                    }
-                }
-
-                // update the decoder state
-                // - check if the sequence is completed
-                // - check if the sequence is failed
-                // - update sliding window based on timestamp tokens
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.completed || decoder.failed) {
-                        continue;
-                    }
-
-                    auto & has_ts     = decoder.has_ts;
-                    auto & failed     = decoder.failed;
-                    auto & completed  = decoder.completed;
-                    auto & seek_delta = decoder.seek_delta;
-                    auto & result_len = decoder.sequence.result_len;
-
-                    {
-                        const auto & token = decoder.sequence.tokens.back();
-
-                        // timestamp token - update sliding window
-                        if (token.id > whisper_token_beg(ctx)) {
-                            const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
-
-                            // do not allow to go back in time
-                            if (has_ts && seek_delta > seek_delta_new && result_len < i) {
-                                failed = true; // TODO: maybe this is not a failure ?
-                                continue;
-                            }
-
-                            seek_delta = seek_delta_new;
-                            result_len = i + 1;
-                            has_ts = true;
-                        }
-
-#ifdef WHISPER_DEBUG
-                        {
-                            const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
-                            WHISPER_PRINT_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
-                                    __func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
-                        }
-#endif
-
-                        // end of segment
-                        if (token.id == whisper_token_eot(ctx) ||               // end of text token
-                           (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
-                           (has_ts && seek + seek_delta + 100 >= seek_end)      // end of audio reached
-                           ) {
-                            if (result_len == 0) {
-                                if (seek + seek_delta + 100 >= seek_end) {
-                                    result_len = i + 1;
-                                } else {
-                                    failed = true;
-                                    continue;
-                                }
-                            }
-
-                            if (params.single_segment) {
-                                result_len = i + 1;
-                                seek_delta = 100*WHISPER_CHUNK_SIZE;
-                            }
-
-                            completed = true;
-                            continue;
-                        }
-
-                        // TESTS: if no tensors are loaded, it means we are running tests
-                        if (ctx->model.n_loaded == 0) {
-                            seek_delta = 100*WHISPER_CHUNK_SIZE;
-                            completed = true;
-                            continue;
-                        }
-                    }
-
-                    // sometimes, the decoding can get stuck in a repetition loop
-                    // this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
-                    if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
-                        failed = true;
-                        continue;
-                    }
-                }
-
-                // check if all decoders have finished (i.e. completed or failed)
-                {
-                    bool completed_all = true;
-
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-
-                        completed_all = false;
-                    }
-
-                    if (completed_all) {
-                        break;
-                    }
-                }
-
-                state->t_sample_us += ggml_time_us() - t_start_sample_us;
-
-                // obtain logits for the next token
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.failed || decoder.completed) {
-                        continue;
-                    }
-
-                    decoder.tokens_tmp.resize(1);
-                    decoder.tokens_tmp[0] = decoder.sequence.tokens.back().id;
-
-                    //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
-
-                    if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
-                        log("%s: failed to decode\n", __func__);
-                        return -8;
-                    }
-
-                    {
-                        const int64_t t_start_sample_us = ggml_time_us();
-
-                        whisper_process_logits(*ctx, *state, params, decoder, t_cur);
-
-                        ++decoder.kv_self.n;
-
-                        state->t_sample_us += ggml_time_us() - t_start_sample_us;
-                    }
-                }
-            }
-
-            // rank the resulting sequences and select the best one
-            {
-                double best_score = -INFINITY;
-
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.failed) {
-                        continue;
-                    }
-
-                    decoder.sequence.tokens.resize(decoder.sequence.result_len);
-                    whisper_sequence_score(params, decoder.sequence);
-
-                    WHISPER_PRINT_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
-                            __func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
-
-                    if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
-                        WHISPER_PRINT_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
-                                __func__, j, decoder.sequence.entropy, params.entropy_thold);
-
-                        decoder.failed = true;
-                        state->n_fail_h++;
-
-                        continue;
-                    }
-
-                    if (best_score < decoder.sequence.score) {
-                        best_score = decoder.sequence.score;
-                        best_decoder_id = j;
-                    }
-                }
-
-                WHISPER_PRINT_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
-            }
-
-            // was the decoding successful for the current temperature?
-            // do fallback only if:
-            // - we are not at the last temperature
-            // - we are not at the end of the audio (3 sec)
-            if (it != (int) temperatures.size() - 1 &&
-                seek_end - seek > 10*WHISPER_CHUNK_SIZE) {
-                bool success = true;
-
-                const auto & decoder = state->decoders[best_decoder_id];
-
-                if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
-                    success = false;
-                    state->n_fail_p++;
-                }
-
-                if (success) {
-                    //for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
-                    //    WHISPER_PRINT_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
-                    //}
-
-                    break;
-                }
-            }
-
-            WHISPER_PRINT_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
-        }
-
-        // output results through a user-provided callback
-        {
-            const auto & best_decoder = state->decoders[best_decoder_id];
-
-            const auto seek_delta = best_decoder.seek_delta;
-            const auto result_len = best_decoder.sequence.result_len;
-
-            const auto & tokens_cur = best_decoder.sequence.tokens;
-
-            //WHISPER_PRINT_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
-
-            // update prompt_past
-            prompt_past.clear();
-            if (prompt.front() == whisper_token_prev(ctx)) {
-                prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
-            }
-
-            for (int i = 0; i < result_len; ++i) {
-                prompt_past.push_back(tokens_cur[i].id);
-            }
-
-            if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
-                int  i0 = 0;
-                auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
-
-                std::string text;
-                bool speaker_turn_next = false;
-
-                for (int i = 0; i < (int) tokens_cur.size(); i++) {
-                    //printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
-                    //        ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
-                    //        ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
-
-                    if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) {
-                        text += whisper_token_to_str(ctx, tokens_cur[i].id);
-                    }
-
-                    // [TDRZ] record if speaker turn was predicted after current segment
-                    if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) {
-                        speaker_turn_next = true;
-                    }
-
-                    if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
-                        const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
-
-                        if (!text.empty()) {
-                            const auto tt0 = params.speed_up ? 2*t0 : t0;
-                            const auto tt1 = params.speed_up ? 2*t1 : t1;
-
-                            if (params.print_realtime) {
-                                if (params.print_timestamps) {
-                                    printf("[%s --> %s]  %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
-                                } else {
-                                    printf("%s", text.c_str());
-                                    fflush(stdout);
-                                }
-                            }
-
-                            //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
-
-                            result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
-                            for (int j = i0; j <= i; j++) {
-                                result_all.back().tokens.push_back(tokens_cur[j]);
-                            }
-
-                            int n_new = 1;
-
-                            if (params.token_timestamps) {
-                                whisper_exp_compute_token_level_timestamps(
-                                        *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
-
-                                if (params.max_len > 0) {
-                                    n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
-                                }
-                            }
-                            if (params.new_segment_callback) {
-                                params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
-                            }
-                        }
-                        text = "";
-                        while (i < (int) tokens_cur.size() && tokens_cur[i].id > whisper_token_beg(ctx)) {
-                            i++;
-                        }
-                        i--;
-                        t0 = t1;
-                        i0 = i + 1;
-                        speaker_turn_next = false;
-                    }
-                }
-
-                if (!text.empty()) {
-                    const auto t1 = seek + seek_delta;
-
-                    const auto tt0 = params.speed_up ? 2*t0 : t0;
-                    const auto tt1 = params.speed_up ? 2*t1 : t1;
-
-                    if (params.print_realtime) {
-                        if (params.print_timestamps) {
-                            printf("[%s --> %s]  %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
-                        } else {
-                            printf("%s", text.c_str());
-                            fflush(stdout);
-                        }
-                    }
-
-                    result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
-                    for (int j = i0; j < (int) tokens_cur.size(); j++) {
-                        result_all.back().tokens.push_back(tokens_cur[j]);
-                    }
-
-                    int n_new = 1;
-
-                    if (params.token_timestamps) {
-                        whisper_exp_compute_token_level_timestamps(
-                                *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
-
-                        if (params.max_len > 0) {
-                            n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
-                        }
-                    }
-                    if (params.new_segment_callback) {
-                        params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
-                    }
-                }
-            }
-
-            // update audio window
-            seek += seek_delta;
-
-            WHISPER_PRINT_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
-        }
-    }
-
-    return 0;
-}
-
-int whisper_full(
-        struct whisper_context * ctx,
-    struct whisper_full_params   params,
-                   const float * samples,
-                           int   n_samples) {
-    return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
-}
-
-int whisper_full_parallel(
-        struct whisper_context * ctx,
-        struct whisper_full_params params,
-        const float * samples,
-        int n_samples,
-        int n_processors) {
-    if (n_processors == 1) {
-        return whisper_full(ctx, params, samples, n_samples);
-    }
-    int ret = 0;
-
-    // prepare separate states for each thread
-    std::vector<whisper_state*> states;
-
-    const int offset_samples = (WHISPER_SAMPLE_RATE*params.offset_ms)/1000;
-    const int n_samples_per_processor = (n_samples - offset_samples)/n_processors;
-
-    // the calling thread will process the first chunk
-    // while the other threads will process the remaining chunks
-
-    std::vector<std::thread> workers(n_processors - 1);
-    for (int i = 0; i < n_processors - 1; ++i) {
-        // create a new state for each thread
-        states.push_back(whisper_init_state(ctx));
-
-        const int start_samples = offset_samples + (i + 1)*n_samples_per_processor;
-        const int n_samples_cur = (i == n_processors - 2) ? n_samples - start_samples : n_samples_per_processor;
-
-        auto params_cur = params;
-
-        params_cur.offset_ms = 0;
-        params_cur.print_progress = false;
-        params_cur.print_realtime = false;
-
-        params_cur.new_segment_callback = nullptr;
-        params_cur.new_segment_callback_user_data = nullptr;
-
-        params_cur.progress_callback = nullptr;
-        params_cur.progress_callback_user_data = nullptr;
-
-        workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur);
-    }
-
-    {
-        auto params_cur = params;
-
-        // We need to disable the print real-time for this one as well, otherwise it will show only for the first chunk.
-        params_cur.print_realtime = false;
-
-        // Run the first transformation using default state but only for the first chunk.
-        ret = whisper_full_with_state(ctx, ctx->state, std::move(params_cur), samples, offset_samples + n_samples_per_processor);
-    }
-
-    for (int i = 0; i < n_processors - 1; ++i) {
-        workers[i].join();
-    }
-
-    const int64_t offset_t = (int64_t) params.offset_ms/10.0;
-
-    // combine results into result_state->result_all from all other states
-    for (int i = 0; i < n_processors - 1; ++i) {
-        auto& results_i = states[i]->result_all;
-
-        for (auto& result : results_i) {
-            // correct the segment timestamp taking into account the offset
-            result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
-            result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
-
-            // make sure that segments are not overlapping
-            if (!ctx->state->result_all.empty()) {
-                result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
-            }
-
-            ctx->state->result_all.push_back(std::move(result));
-
-            // call the new_segment_callback for each segment
-            if (params.new_segment_callback) {
-                params.new_segment_callback(ctx, ctx->state, 1, params.new_segment_callback_user_data);
-            }
-        }
-
-        ctx->state->t_mel_us += states[i]->t_mel_us;
-
-        ctx->state->t_sample_us += states[i]->t_sample_us;
-        ctx->state->t_encode_us += states[i]->t_encode_us;
-        ctx->state->t_decode_us += states[i]->t_decode_us;
-
-        whisper_free_state(states[i]);
-    }
-
-    // average the timings
-    ctx->state->t_mel_us    /= n_processors;
-    ctx->state->t_sample_us /= n_processors;
-    ctx->state->t_encode_us /= n_processors;
-    ctx->state->t_decode_us /= n_processors;
-
-    // print information about the audio boundaries
-    log("\n");
-    log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
-    for (int i = 0; i < n_processors - 1; ++i) {
-        log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
-    }
-    log("%s: the transcription quality may be degraded near these boundaries\n", __func__);
-
-    return ret;
-}
-
-int whisper_full_n_segments_from_state(struct whisper_state * state) {
-    return state->result_all.size();
-}
-
-int whisper_full_n_segments(struct whisper_context * ctx) {
-    return ctx->state->result_all.size();
-}
-
-int whisper_full_lang_id_from_state(struct whisper_state * state) {
-    return state->lang_id;
-}
-
-int whisper_full_lang_id(struct whisper_context * ctx) {
-    return ctx->state->lang_id;
-}
-
-int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].t0;
-}
-
-int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].t0;
-}
-
-int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].t1;
-}
-
-int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].t1;
-}
-
-bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].speaker_turn_next;
-}
-
-const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].text.c_str();
-}
-
-const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].text.c_str();
-}
-
-int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].tokens.size();
-}
-
-int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].tokens.size();
-}
-
-const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token) {
-    return ctx->vocab.id_to_token[state->result_all[i_segment].tokens[i_token].id].c_str();
-}
-
-const char* whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->vocab.id_to_token[ctx->state->result_all[i_segment].tokens[i_token].id].c_str();
-}
-
-whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token].id;
-}
-
-whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token].id;
-}
-
-struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token];
-}
-
-struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token];
-}
-
-float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token].p;
-}
-
-float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token].p;
-}
-
-// =================================================================================================
-
-//
-// Temporary interface needed for exposing ggml interface
-// Will be removed in the future when ggml becomes a separate library
-//
-
-WHISPER_API int whisper_bench_memcpy(int n_threads) {
-    fputs(whisper_bench_memcpy_str(n_threads), stderr);
-    return 0;
-}
-
-WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
-    static std::string s;
-    s = "";
-    char strbuf[256];
-
-    ggml_time_init();
-
-    size_t n    = 20;
-    size_t arr  = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
-
-    // 1GB MB array
-    const size_t size = arr*1024llu*1024llu;
-
-    // single-thread
-    {
-        char * src = (char *) malloc(size);
-        char * dst = (char *) malloc(size);
-
-        for (size_t i = 0; i < size; i++) src[i] = i;
-
-        memcpy(dst, src, size); // heat-up
-
-        double tsum = 0.0;
-        double sum  = 0.0;
-
-        for (size_t i = 0; i < n; i++) {
-            const int64_t t0 = ggml_time_us();
-
-            memcpy(dst, src, size);
-
-            const int64_t t1 = ggml_time_us();
-
-            tsum += (t1 - t0)*1e-6;
-
-            src[rand() % size] = rand() % 256;
-        }
-
-        snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
-        s += strbuf;
-
-        // needed to prevent the compiler from optimizing the memcpy away
-        {
-            for (size_t i = 0; i < size; i++) sum += dst[i];
-
-            snprintf(strbuf, sizeof(strbuf), "sum:    %f\n", sum);
-            s += strbuf;
-        }
-
-        free(src);
-        free(dst);
-    }
-
-    return s.c_str();
-}
-
-WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
-    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
-    return 0;
-}
-
-WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
-    static std::string s;
-    s = "";
-    char strbuf[256];
-
-    ggml_time_init();
-
-    const int n_max = 128;
-
-    const std::vector<size_t> sizes = {
-        64, 128, 256, 512, 1024, 2048, 4096,
-    };
-
-    const size_t N_max = sizes.back();
-
-    // a: N*N*sizeof(float)
-    // b: N*N*sizeof(float)
-    // c: N*N*sizeof(float)
-    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
-
-    // put a bunch of random data in the buffer
-    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
-
-    for (int j = 0; j < (int) sizes.size(); j++) {
-        int n_q4_0 = 0;
-        int n_q4_1 = 0;
-        int n_q5_0 = 0;
-        int n_q5_1 = 0;
-        int n_q8_0 = 0;
-        int n_fp16 = 0;
-        int n_fp32 = 0;
-
-        // GFLOPS/s
-        double s_q4_0 = 0.0;
-        double s_q4_1 = 0.0;
-        double s_q5_0 = 0.0;
-        double s_q5_1 = 0.0;
-        double s_q8_0 = 0.0;
-        double s_fp16 = 0.0;
-        double s_fp32 = 0.0;
-
-        const size_t N = sizes[j];
-
-        for (int k = 0; k < 7; ++k) {
-            const ggml_type wtype =
-                k == 0 ? GGML_TYPE_Q4_0 :
-                k == 1 ? GGML_TYPE_Q4_1 :
-                k == 2 ? GGML_TYPE_Q5_0 :
-                k == 3 ? GGML_TYPE_Q5_1 :
-                k == 4 ? GGML_TYPE_Q8_0 :
-                k == 5 ? GGML_TYPE_F16  : GGML_TYPE_F32;
-
-            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
-            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
-
-            struct ggml_init_params gparams = {
-                /*.mem_size   =*/ buf.size(),
-                /*.mem_buffer =*/ buf.data(),
-                /*.no_alloc   =*/ false,
-            };
-
-            struct ggml_context * ctx0 = ggml_init(gparams);
-
-            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
-            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
-
-            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
-
-            struct ggml_cgraph gf = ggml_build_forward(c);
-
-            double tsum = 0.0;
-
-            // heat-up
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-            for (int i = 0; i < n_max; ++i) {
-                const int64_t t0 = ggml_time_us();
-
-                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-                const int64_t t1 = ggml_time_us();
-
-                tsum += (t1 - t0)*1e-6;
-                n++;
-
-                if (tsum > 1.0 && n >= 3) {
-                    break;
-                }
-            }
-
-            ggml_free(ctx0);
-
-            s = ((2.0*N*N*N*n)/tsum)*1e-9;
-        }
-
-        // Q4_0 | Q4_1
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
-        s += strbuf;
-
-        // Q5_0 | Q5_1 | Q8_0
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
-        s += strbuf;
-
-        // F16 | F32
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16  %7.1f GFLOPS (%3d runs) | F32  %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
-        s += strbuf;
-    }
-
-    return s.c_str();
-}
-
-// =================================================================================================
-
-// =================================================================================================
-
-//
-// Experimental stuff below
-//
-// Not sure if these should be part of the library at all, because the quality of the results is not
-// guaranteed. Might get removed at some point unless a robust algorithm implementation is found
-//
-
-// =================================================================================================
-
-//
-// token-level timestamps
-//
-
-static int timestamp_to_sample(int64_t t, int n_samples) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
-}
-
-static int64_t sample_to_timestamp(int i_sample) {
-    return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
-}
-
-// a cost-function / heuristic that is high for text that takes longer to pronounce
-// obviously, can be improved
-static float voice_length(const std::string & text) {
-    float res = 0.0f;
-
-    for (char c : text) {
-        if (c == ' ') {
-            res += 0.01f;
-        } else if (c == ',') {
-            res += 2.00f;
-        } else if (c == '.') {
-            res += 3.00f;
-        } else if (c == '!') {
-            res += 3.00f;
-        } else if (c == '?') {
-            res += 3.00f;
-        } else if (c >= '0' && c <= '9') {
-            res += 3.00f;
-        } else {
-            res += 1.00f;
-        }
-    }
-
-    return res;
-}
-
-// average the fabs of the signal
-static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window) {
-    const int hw = n_samples_per_half_window;
-
-    std::vector<float> result(n_samples);
-
-    for (int i = 0; i < n_samples; i++) {
-        float sum = 0;
-        for (int j = -hw; j <= hw; j++) {
-            if (i + j >= 0 && i + j < n_samples) {
-                sum += fabs(signal[i + j]);
-            }
-        }
-        result[i] = sum/(2*hw + 1);
-    }
-
-    return result;
-}
-
-static void whisper_exp_compute_token_level_timestamps(
-        struct whisper_context & ctx,
-          struct whisper_state & state,
-                           int   i_segment,
-                         float   thold_pt,
-                         float   thold_ptsum) {
-    auto & segment = state.result_all[i_segment];
-    auto & tokens  = segment.tokens;
-
-    const int n_samples = state.energy.size();
-
-    if (n_samples == 0) {
-        log("%s: no signal data available\n", __func__);
-        return;
-    }
-
-    const int64_t t0 = segment.t0;
-    const int64_t t1 = segment.t1;
-
-    const int n = tokens.size();
-
-    if (n == 0) {
-        return;
-    }
-
-    if (n == 1) {
-        tokens[0].t0 = t0;
-        tokens[0].t1 = t1;
-
-        return;
-    }
-
-    auto & t_beg    = state.t_beg;
-    auto & t_last   = state.t_last;
-    auto & tid_last = state.tid_last;
-
-    for (int j = 0; j < n; ++j) {
-        auto & token = tokens[j];
-
-        if (j == 0) {
-            if (token.id == whisper_token_beg(&ctx)) {
-                tokens[j    ].t0 = t0;
-                tokens[j    ].t1 = t0;
-                tokens[j + 1].t0 = t0;
-
-                t_beg    = t0;
-                t_last   = t0;
-                tid_last = whisper_token_beg(&ctx);
-            } else {
-                tokens[j    ].t0 = t_last;
-            }
-        }
-
-        const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(&ctx));
-
-        tokens[j].id    = token.id;
-        tokens[j].tid   = token.tid;
-        tokens[j].p     = token.p;
-        tokens[j].pt    = token.pt;
-        tokens[j].ptsum = token.ptsum;
-
-        tokens[j].vlen = voice_length(whisper_token_to_str(&ctx, token.id));
-
-        if (token.pt > thold_pt && token.ptsum > thold_ptsum && token.tid > tid_last && tt <= t1) {
-            if (j > 0) {
-                tokens[j - 1].t1 = tt;
-            }
-            tokens[j].t0 = tt;
-            tid_last = token.tid;
-        }
-    }
-
-    tokens[n - 2].t1 = t1;
-    tokens[n - 1].t0 = t1;
-    tokens[n - 1].t1 = t1;
-
-    t_last = t1;
-
-    // find intervals of tokens with unknown timestamps
-    // fill the timestamps by proportionally splitting the interval based on the token voice lengths
-    {
-        int p0 = 0;
-        int p1 = 0;
-
-        while (true) {
-            while (p1 < n && tokens[p1].t1 < 0) {
-                p1++;
-            }
-
-            if (p1 >= n) {
-                p1--;
-            }
-
-            //printf("p0=%d p1=%d t0=%lld t1=%lld\n", p0, p1, tokens[p0].t0, tokens[p1].t1);
-
-            if (p1 > p0) {
-                double psum = 0.0;
-                for (int j = p0; j <= p1; j++) {
-                    psum += tokens[j].vlen;
-                }
-
-                //printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
-
-                const double dt = tokens[p1].t1 - tokens[p0].t0;
-
-                // split the time proportionally to the voice length
-                for (int j = p0 + 1; j <= p1; j++) {
-                    const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
-
-                    tokens[j - 1].t1 = ct;
-                    tokens[j    ].t0 = ct;
-                }
-            }
-
-            p1++;
-            p0 = p1;
-            if (p1 >= n) {
-                break;
-            }
-        }
-    }
-
-    // fix up (just in case)
-    for (int j = 0; j < n - 1; j++) {
-        if (tokens[j].t1 < 0) {
-            tokens[j + 1].t0 = tokens[j].t1;
-        }
-
-        if (j > 0) {
-            if (tokens[j - 1].t1 > tokens[j].t0) {
-                tokens[j].t0 = tokens[j - 1].t1;
-                tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
-            }
-        }
-    }
-
-    // VAD
-    // expand or contract tokens based on voice activity
-    {
-        const int hw = WHISPER_SAMPLE_RATE/8;
-
-        for (int j = 0; j < n; j++) {
-            if (tokens[j].id >= whisper_token_eot(&ctx)) {
-                continue;
-            }
-
-            int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
-            int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
-
-            const int ss0 = std::max(s0 - hw, 0);
-            const int ss1 = std::min(s1 + hw, n_samples);
-
-            const int ns = ss1 - ss0;
-
-            float sum = 0.0f;
-
-            for (int k = ss0; k < ss1; k++) {
-                sum += state.energy[k];
-            }
-
-            const float thold = 0.5*sum/ns;
-
-            {
-                int k = s0;
-                if (state.energy[k] > thold && j > 0) {
-                    while (k > 0 && state.energy[k] > thold) {
-                        k--;
-                    }
-                    tokens[j].t0 = sample_to_timestamp(k);
-                    if (tokens[j].t0 < tokens[j - 1].t1) {
-                        tokens[j].t0 = tokens[j - 1].t1;
-                    } else {
-                        s0 = k;
-                    }
-                } else {
-                    while (state.energy[k] < thold && k < s1) {
-                        k++;
-                    }
-                    s0 = k;
-                    tokens[j].t0 = sample_to_timestamp(k);
-                }
-            }
-
-            {
-                int k = s1;
-                if (state.energy[k] > thold) {
-                    while (k < n_samples - 1 && state.energy[k] > thold) {
-                        k++;
-                    }
-                    tokens[j].t1 = sample_to_timestamp(k);
-                    if (j < ns - 1 && tokens[j].t1 > tokens[j + 1].t0) {
-                        tokens[j].t1 = tokens[j + 1].t0;
-                    } else {
-                        s1 = k;
-                    }
-                } else {
-                    while (state.energy[k] < thold && k > s0) {
-                        k--;
-                    }
-                    s1 = k;
-                    tokens[j].t1 = sample_to_timestamp(k);
-                }
-            }
-        }
-    }
-
-    // fixed token expand (optional)
-    //{
-    //    const int t_expand = 0;
-
-    //    for (int j = 0; j < n; j++) {
-    //        if (j > 0) {
-    //            tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
-    //        }
-    //        if (j < n - 1) {
-    //            tokens[j].t1 = tokens[j].t1 + t_expand;
-    //        }
-    //    }
-    //}
-
-    // debug info
-    //for (int j = 0; j < n; ++j) {
-    //    const auto & token = tokens[j];
-    //    const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(&ctx, token.tid) : "[?]";
-    //    printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
-    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(&ctx, token.id));
-
-    //    if (tokens[j].id >= whisper_token_eot(&ctx)) {
-    //        continue;
-    //    }
-    //}
-}
-
-void whisper_set_log_callback(whisper_log_callback callback) {
-    whisper_log = callback;
-}

+ 0 - 531
ggml/examples/whisper/whisper.h

@@ -1,531 +0,0 @@
-#ifndef WHISPER_H
-#define WHISPER_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef WHISPER_SHARED
-#    ifdef _WIN32
-#        ifdef WHISPER_BUILD
-#            define WHISPER_API __declspec(dllexport)
-#        else
-#            define WHISPER_API __declspec(dllimport)
-#        endif
-#    else
-#        define WHISPER_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define WHISPER_API
-#endif
-
-#define WHISPER_SAMPLE_RATE 16000
-#define WHISPER_N_FFT       400
-#define WHISPER_N_MEL       80
-#define WHISPER_HOP_LENGTH  160
-#define WHISPER_CHUNK_SIZE  30
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // The following interface is thread-safe as long as the sample whisper_context is not used by multiple threads
-    // concurrently.
-    //
-    // Basic usage:
-    //
-    //     #include "whisper.h"
-    //
-    //     ...
-    //
-    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
-    //
-    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-    //         fprintf(stderr, "failed to process audio\n");
-    //         return 7;
-    //     }
-    //
-    //     const int n_segments = whisper_full_n_segments(ctx);
-    //     for (int i = 0; i < n_segments; ++i) {
-    //         const char * text = whisper_full_get_segment_text(ctx, i);
-    //         printf("%s", text);
-    //     }
-    //
-    //     whisper_free(ctx);
-    //
-    //     ...
-    //
-    // This is a demonstration of the most straightforward usage of the library.
-    // "pcmf32" contains the RAW audio data in 32-bit floating point format.
-    //
-    // The interface also allows for more fine-grained control over the computation, but it requires a deeper
-    // understanding of how the model works.
-    //
-
-    struct whisper_context;
-    struct whisper_state;
-    struct whisper_full_params;
-
-    typedef int whisper_token;
-
-    typedef struct whisper_token_data {
-        whisper_token id;  // token id
-        whisper_token tid; // forced timestamp token id
-
-        float p;           // probability of the token
-        float plog;        // log probability of the token
-        float pt;          // probability of the timestamp token
-        float ptsum;       // sum of probabilities of all timestamp tokens
-
-        // token-level timestamp data
-        // do not use if you haven't computed token-level timestamps
-        int64_t t0;        // start time of the token
-        int64_t t1;        //   end time of the token
-
-        float vlen;        // voice length of the token
-    } whisper_token_data;
-
-    typedef struct whisper_model_loader {
-        void * context;
-
-        size_t (*read)(void * ctx, void * output, size_t read_size);
-        bool    (*eof)(void * ctx);
-        void  (*close)(void * ctx);
-    } whisper_model_loader;
-
-    // Various functions for loading a ggml whisper model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
-
-    // These are the same as the above, but the internal state of the context is not allocated automatically
-    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
-
-    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
-
-    // Given a context, enable use of OpenVINO for encode inference.
-    // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
-    //                      the path will be generated from the ggml model path that was passed
-    //                      in to whisper_init_from_file. For example, if 'path_model' was
-    //                      "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
-    //                      assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
-    // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
-    // cache_dir: Optional cache directory that can speed up init time, especially for
-    //                     GPU, by caching compiled 'blobs' there.
-    //                     Set to nullptr if not used.
-    // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
-    WHISPER_API int whisper_ctx_init_openvino_encoder(
-        struct whisper_context * ctx,
-                    const char * model_path,
-                    const char * device,
-                    const char * cache_dir);
-
-    // Frees all allocated memory
-    WHISPER_API void whisper_free      (struct whisper_context * ctx);
-    WHISPER_API void whisper_free_state(struct whisper_state * state);
-    WHISPER_API void whisper_free_params(struct whisper_full_params * params);
-
-    // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel(
-            struct whisper_context * ctx,
-                       const float * samples,
-                               int   n_samples,
-                               int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * samples,
-                               int   n_samples,
-                               int   n_threads);
-
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
-    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
-    // n_mel must be 80
-    // Returns 0 on success
-    WHISPER_API int whisper_set_mel(
-            struct whisper_context * ctx,
-                       const float * data,
-                               int   n_len,
-                               int   n_mel);
-
-    WHISPER_API int whisper_set_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * data,
-                               int   n_len,
-                               int   n_mel);
-
-    // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
-    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
-    // offset can be used to specify the offset of the first frame in the spectrogram.
-    // Returns 0 on success
-    WHISPER_API int whisper_encode(
-            struct whisper_context * ctx,
-                               int   offset,
-                               int   n_threads);
-
-    WHISPER_API int whisper_encode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset,
-                               int   n_threads);
-
-    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
-    // Make sure to call whisper_encode() first.
-    // tokens + n_tokens is the provided context for the decoder.
-    // n_past is the number of tokens to use from previous decoder calls.
-    // Returns 0 on success
-    // TODO: add support for multiple decoders
-    WHISPER_API int whisper_decode(
-            struct whisper_context * ctx,
-               const whisper_token * tokens,
-                               int   n_tokens,
-                               int   n_past,
-                               int   n_threads);
-
-    WHISPER_API int whisper_decode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-               const whisper_token * tokens,
-                               int   n_tokens,
-                               int   n_past,
-                               int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns -1 on failure
-    // TODO: not sure if correct
-    WHISPER_API int whisper_tokenize(
-            struct whisper_context * ctx,
-                        const char * text,
-                     whisper_token * tokens,
-                               int   n_max_tokens);
-
-    // Largest language id (i.e. number of available languages - 1)
-    WHISPER_API int whisper_lang_max_id();
-
-    // Return the id of the specified language, returns -1 if not found
-    // Examples:
-    //   "de" -> 2
-    //   "german" -> 2
-    WHISPER_API int whisper_lang_id(const char * lang);
-
-    // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
-    WHISPER_API const char * whisper_lang_str(int id);
-
-    // Use mel data at offset_ms to try and auto-detect the spoken language
-    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
-    // Returns the top language id or negative on failure
-    // If not null, fills the lang_probs array with the probabilities of all languages
-    // The array must be whisper_lang_max_id() + 1 in size
-    // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
-    WHISPER_API int whisper_lang_auto_detect(
-            struct whisper_context * ctx,
-                               int   offset_ms,
-                               int   n_threads,
-                             float * lang_probs);
-
-    WHISPER_API int whisper_lang_auto_detect_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset_ms,
-                               int   n_threads,
-                             float * lang_probs);
-
-    WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
-    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
-    WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
-    WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
-
-    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_ftype        (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
-
-    // Token logits obtained from the last call to whisper_decode()
-    // The logits for the last token are stored in the last row
-    // Rows: n_tokens
-    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
-    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
-    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
-
-
-    // Special tokens
-    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
-
-    // Task tokens
-    WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
-
-    // Performance information from the default state.
-    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
-    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
-
-    // Print system information
-    WHISPER_API const char * whisper_print_system_info(void);
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    // Available sampling strategies
-    enum whisper_sampling_strategy {
-        WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
-        WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
-    };
-
-    // Text segment callback
-    // Called on every newly generated text segment
-    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
-
-    // Progress callback
-    typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
-
-    // Encoder begin callback
-    // If not NULL, called before the encoder starts
-    // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
-
-    // Logits filter callback
-    // Can be used to modify the logits before sampling
-    // If not NULL, called after applying temperature to logits
-    typedef void (*whisper_logits_filter_callback)(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-          const whisper_token_data * tokens,
-                               int   n_tokens,
-                             float * logits,
-                              void * user_data);
-
-    // Parameters for the whisper_full() function
-    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
-    // whisper_full_default_params()
-    struct whisper_full_params {
-        enum whisper_sampling_strategy strategy;
-
-        int n_threads;
-        int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
-        int offset_ms;          // start offset in ms
-        int duration_ms;        // audio duration to process in ms
-
-        bool translate;
-        bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
-        bool single_segment;    // force single segment output (useful for streaming)
-        bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
-        bool print_progress;    // print progress information
-        bool print_realtime;    // print results from within whisper.cpp (avoid it, use callback instead)
-        bool print_timestamps;  // print timestamps for each text segment when printing realtime
-
-        // [EXPERIMENTAL] token-level timestamps
-        bool  token_timestamps; // enable token-level timestamps
-        float thold_pt;         // timestamp token probability threshold (~0.01)
-        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
-        int   max_len;          // max segment length in characters
-        bool  split_on_word;    // split on word rather than on token (when used with max_len)
-        int   max_tokens;       // max tokens per segment (0 = no limit)
-
-        // [EXPERIMENTAL] speed-up techniques
-        // note: these can significantly reduce the quality of the output
-        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
-        bool debug_mode;        // enable debug_mode provides extra info (eg. Dump log_mel)
-        int  audio_ctx;         // overwrite the audio context size (0 = use default)
-
-        // [EXPERIMENTAL] [TDRZ] tinydiarize
-        bool tdrz_enable;       // enable tinydiarize speaker turn detection
-
-        // tokens to provide to the whisper decoder as initial prompt
-        // these are prepended to any existing text context from a previous call
-        const char * initial_prompt;
-        const whisper_token * prompt_tokens;
-        int prompt_n_tokens;
-
-        // for auto-detection, set to nullptr, "" or "auto"
-        const char * language;
-        bool detect_language;
-
-        // common decoding parameters:
-        bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
-        bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-
-        float temperature;      // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
-        float max_initial_ts;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
-        float length_penalty;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
-
-        // fallback parameters
-        // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
-        float temperature_inc;
-        float entropy_thold;    // similar to OpenAI's "compression_ratio_threshold"
-        float logprob_thold;
-        float no_speech_thold;  // TODO: not implemented
-
-        struct {
-            int best_of;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
-        } greedy;
-
-        struct {
-            int beam_size;  // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
-
-            float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
-        } beam_search;
-
-        // called for every newly generated text segment
-        whisper_new_segment_callback new_segment_callback;
-        void * new_segment_callback_user_data;
-
-        // called on each progress update
-        whisper_progress_callback progress_callback;
-        void * progress_callback_user_data;
-
-        // called each time before the encoder starts
-        whisper_encoder_begin_callback encoder_begin_callback;
-        void * encoder_begin_callback_user_data;
-
-        // called by each decoder to filter obtained logits
-        whisper_logits_filter_callback logits_filter_callback;
-        void * logits_filter_callback_user_data;
-    };
-
-    // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
-    WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
-    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
-
-    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
-    // Not thread safe for same context
-    // Uses the specified decoding strategy to obtain the text.
-    WHISPER_API int whisper_full(
-                struct whisper_context * ctx,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples);
-
-    WHISPER_API int whisper_full_with_state(
-                struct whisper_context * ctx,
-                  struct whisper_state * state,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples);
-
-    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
-    // Result is stored in the default state of the context
-    // Not thread safe if executed in parallel on the same context.
-    // It seems this approach can offer some speedup in some cases.
-    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
-    WHISPER_API int whisper_full_parallel(
-                struct whisper_context * ctx,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples,
-                                   int   n_processors);
-
-    // Number of generated text segments
-    // A segment can be a few words, a sentence, or even a paragraph.
-    WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
-    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
-
-    // Language id associated with the context's default state
-    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
-
-    // Language id associated with the provided state
-    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
-
-    // Get the start and end time of the specified segment
-    WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
-
-    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
-
-    // Get whether the next segment is predicted as a speaker turn
-    WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
-
-    // Get the text of the specified segment
-    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
-
-    // Get number of tokens in the specified segment
-    WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
-
-    // Get the token text of the specified token in the specified segment
-    WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
-
-    WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    // Get token data for the specified token in the specified segment
-    // This contains probabilities, timestamps, etc.
-    WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    // Get the probability of the specified token in the specified segment
-    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    // Temporary helpers needed for exposing ggml interface
-
-    WHISPER_API int          whisper_bench_memcpy          (int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str      (int n_threads);
-    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
-
-    // Control logging output; default behavior is to print to stderr
-
-    typedef void (*whisper_log_callback)(const char * line);
-    WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif