
cleanup comments

Guillaume Wenzek, 1 year ago
commit 48fc7dfcd5
2 files changed with 17 additions and 26 deletions
  1. + 3 - 6    ggml/examples/unity/model_loader.cpp
  2. + 14 - 20  ggml/ggml_convert.py

+ 3 - 6
ggml/examples/unity/model_loader.cpp

@@ -39,18 +39,15 @@ std::int64_t
 model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
 {
     std::int64_t num_tensor = 0;
-    std::int64_t f32_ctx_size = 0;
+    std::int64_t f32_tensor_size = 0;
     fin.read((char*) &num_tensor, sizeof(num_tensor));
-    fin.read((char*) &f32_ctx_size, sizeof(f32_ctx_size));
+    fin.read((char*) &f32_tensor_size, sizeof(f32_tensor_size));
 
     // TODO: it might be interesting to allow the caller to not upcast the weights to float32.
 // Note this requires changing the on-disk format
     bool as_float32 = true;
-    std::int64_t f16_ctx_size = f32_ctx_size;
-    // fin.read((char*) &f16_ctx_size, sizeof(f16_ctx_size));
-
     struct ggml_init_params params = {
-        /*.mem_size   =*/ as_float32 ? f32_ctx_size : f16_ctx_size,
+        /*.mem_size   =*/ f32_tensor_size + num_tensor * (int64_t)ggml_tensor_overhead(),
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ false,
     };
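
The new mem_size covers ggml's per-tensor metadata in addition to the raw float32 payload, instead of relying on the converter to bake that overhead into the size it writes. A minimal sketch of the sizing rule (not part of the commit; required_ctx_size is a hypothetical helper, and it reuses the ggml Python bindings already imported by ggml_convert.py):

    import ggml

    def required_ctx_size(num_tensors: int, f32_byte_size: int) -> int:
        # Mirrors the mem_size expression in model_loader.cpp: the raw tensor
        # data plus one metadata block per tensor, since the context is created
        # with no_alloc = false and therefore holds both.
        return f32_byte_size + num_tensors * ggml.ggml_tensor_overhead()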

+ 14 - 20
ggml/ggml_convert.py

@@ -219,33 +219,27 @@ def write_state_dict(
         convert float32 tensors to float16 on disk
     """
     out.write(struct.pack("<q", len(state_dict)))
-    # True size of each tensor
+    # True size of each tensor (before downcasting to float16)
     true_byte_size = sum(x.numel() * x.element_size() for x in state_dict.values())
-    # + tensor overhead
-    true_byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
-
-    def _fp16_byte_size(x: torch.Tensor) -> int:
-        full_byte_size = x.numel() * x.element_size()
-        if fp16 and x.dtype == torch.float32:
-            full_byte_size //= 2
-        return full_byte_size
-
-    # Compressed size
-    compressed_byte_size = sum(_fp16_byte_size(x) for x in state_dict.values())
-    compressed_byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
-
     out.write(struct.pack("<q", true_byte_size))
-    # TODO: it could be interesting to write this to allow model_loader to chose the precision when loading.
-    # But changing this require republishing .ggml files
-    # out.write(struct.pack("<q", compressed_byte_size))
+
     GB = 1024**3
-    if fp16:
+    if not fp16:
         log.warning(
-            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb compressed to {compressed_byte_size / GB:.3f}"
+            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb"
         )
     else:
+
+        def _fp16_byte_size(x: torch.Tensor) -> int:
+            full_byte_size = x.numel() * x.element_size()
+            if fp16 and x.dtype == torch.float32:
+                full_byte_size //= 2
+            return full_byte_size
+
+        # Compressed size
+        compressed_byte_size = sum(_fp16_byte_size(x) for x in state_dict.values())
         log.warning(
-            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb"
+            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb compressed to {compressed_byte_size / GB:.3f}"
         )
 
     for key, value in state_dict.items():
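
For context (not part of the commit): the header written above is just two little-endian int64 values, the tensor count followed by the float32 payload size, which model_loader.cpp reads back with raw fin.read calls, so both sides assume the same little-endian layout. A small round-trip sketch with made-up sizes:

    import io
    import struct

    buf = io.BytesIO()
    buf.write(struct.pack("<q", 3))        # number of tensors (illustrative)
    buf.write(struct.pack("<q", 123_456))  # true float32 byte size (illustrative)
    buf.seek(0)
    num_tensors, f32_byte_size = struct.unpack("<qq", buf.read(16))
    assert (num_tensors, f32_byte_size) == (3, 123_456)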