@@ -219,33 +219,27 @@ def write_state_dict(
         convert float32 tensors to float16 on disk
     """
     out.write(struct.pack("<q", len(state_dict)))
-    # True size of each tensor
+    # True size of each tensor (before downcasting to float16)
     true_byte_size = sum(x.numel() * x.element_size() for x in state_dict.values())
-    # + tensor overhead
-    true_byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
-
-    def _fp16_byte_size(x: torch.Tensor) -> int:
-        full_byte_size = x.numel() * x.element_size()
-        if fp16 and x.dtype == torch.float32:
-            full_byte_size //= 2
-        return full_byte_size
-
-    # Compressed size
-    compressed_byte_size = sum(_fp16_byte_size(x) for x in state_dict.values())
-    compressed_byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
-
     out.write(struct.pack("<q", true_byte_size))
-    # TODO: it could be interesting to write this to allow model_loader to chose the precision when loading.
-    # But changing this require republishing .ggml files
-    # out.write(struct.pack("<q", compressed_byte_size))
+
     GB = 1024**3
-    if fp16:
+    if not fp16:
         log.warning(
-            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb compressed to {compressed_byte_size / GB:.3f}"
+            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb"
         )
     else:
+
+        def _fp16_byte_size(x: torch.Tensor) -> int:
+            full_byte_size = x.numel() * x.element_size()
+            if fp16 and x.dtype == torch.float32:
+                full_byte_size //= 2
+            return full_byte_size
+
+        # Compressed size
+        compressed_byte_size = sum(_fp16_byte_size(x) for x in state_dict.values())
         log.warning(
+            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb compressed to {compressed_byte_size / GB:.3f}"
         )
 
     for key, value in state_dict.items():
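
For reference, a minimal sketch (not part of the patch) of how the two header fields written above could be read back. The `<q` little-endian int64 encoding and the field order come straight from the `struct.pack` calls in the diff; the file name and everything else here is assumed for illustration.

import struct

with open("model.ggml", "rb") as f:  # hypothetical path
    # First header field: number of tensors, written as len(state_dict)
    (num_tensors,) = struct.unpack("<q", f.read(8))
    # Second header field: true (uncompressed) byte size of all tensors
    (true_byte_size,) = struct.unpack("<q", f.read(8))

print(f"{num_tensors} tensors, {true_byte_size} bytes uncompressed")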