  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
  5. import numpy as np
  6. import ctypes
  7. import torch
  8. import functools
  9. from pathlib import Path
  10. from typing import Dict
  11. from typing import Callable
  12. from typing import Any
  13. from typing import Tuple
  14. from typing import Union
  15. from typing import Type
  16. from third_party_ggml import *
  17. ### Helpers
def numpy_dtype(ggml_type: ctypes.c_int) -> type:
    if ggml_type == 0:
        # GGML_TYPE_F32 = 0,
        return np.float32
    if ggml_type == 1:
        # GGML_TYPE_F16 = 1,
        return np.float16
    raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")


def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
    if dtype == np.float32:
        return ctypes.c_int(0)
    elif dtype == np.float16:
        return ctypes.c_int(1)
    raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")


def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    return tuple([tensor.ne[i] for i in range(ndims)])


def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.nb[i] for i in range(4)])


def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    raise NotImplementedError()
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
    # TODO: convert to numpy strides
    return num_bytes
def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    t_shape = shape(tensor)
    # Convert the ggml data pointer to a pointer to ints of the same size (float16 -> uint16).
    # This is needed because Python ctypes doesn't have "float16", and as_array only works with ctypes pointers.
    type_size = ggml_type_size(tensor.type)
    int_width: type = getattr(ctypes, f"c_uint{8 * type_size}")
    ptr = ctypes.cast(tensor.data, ctypes.POINTER(int_width))
    # Create a numpy array with the wrong dtype
    int_arr = np.ctypeslib.as_array(ptr, shape=t_shape)
    # Reinterpret it to the right dtype
    res = np.frombuffer(int_arr, dtype=numpy_dtype(tensor.type)).reshape(t_shape)
    # TODO: assert strides / check contiguous
    # assert strides(tensor) == res.strides, "TODO: support strided tensor"
    return res
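
# Illustrative usage sketch, not part of the vendored API. It assumes that
# third_party_ggml re-exports the standard ggml-python bindings used below
# (ggml_init, ggml_init_params, ggml_new_tensor_2d, ggml_set_f32, ggml_free).
# to_numpy returns a zero-copy view: writes through the array hit the ggml tensor.
def _example_to_numpy_view() -> None:
    ctx = ggml_init(ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None))
    t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3)
    ggml_set_f32(t, 1.5)
    view = to_numpy(t)
    assert view.shape == (4, 3) and view.dtype == np.float32
    assert np.allclose(view, 1.5)
    ggml_free(ctx)
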
GgmlShape = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS


def from_file(
    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
) -> ggml_tensor_p:
    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
    return from_numpy(ctx, data)


def _pad_shape(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
    if len(shape) >= 4:
        return shape  # type: ignore
    padding = (1,) * (4 - len(shape))
    return shape + padding  # type: ignore


def _compute_nbytes(
    ne: Tuple[int, int, int, int], type: ctypes.c_int
) -> Tuple[int, int, int, int]:
    nb0 = ggml_type_size(type)
    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
    nb2 = nb1 * ne[1]
    nb3 = nb2 * ne[2]
    return (nb0, nb1, nb2, nb3)
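
# Worked example: for an F32 tensor with ne = (1024, 8, 1, 1), ggml_type_size is 4
# and ggml_blck_size is 1, so _compute_nbytes returns (4, 4096, 32768, 32768):
# 4 bytes per element, 4096 bytes per row of 1024 elements, and so on.
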
def from_numpy(
    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]
) -> ggml_tensor_p:
    if type(array).__name__ == "Tensor":
        array = array.numpy()
    # Create an empty tensor so we don't allocate memory for the data pointer
    gtype = from_numpy_dtype(array.dtype)
    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
    # Fill out the correct dimensions and shape.
    tensor_p.contents.n_dims = array.ndim
    shape = _pad_shape(array.shape)
    tensor_p.contents.ne = GgmlShape(*shape)
    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(shape, gtype))
    # Point the tensor data to the content of the numpy array.
    tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
    # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
    # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
    # Prevent the underlying numpy array from being freed while the tensor is alive.
    setattr(tensor_p, "__data", array)
    return tensor_p
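
# Illustrative round-trip sketch, not part of the vendored API (assumes
# third_party_ggml re-exports ggml_init/ggml_init_params/ggml_free from
# ggml-python). from_numpy does not copy: the ggml tensor points directly at the
# numpy buffer, so to_numpy recovers the same values.
def _example_from_numpy_roundtrip() -> None:
    ctx = ggml_init(ggml_init_params(mem_size=1024 * 1024, mem_buffer=None))
    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    t = from_numpy(ctx, a)
    assert shape(t) == (3, 4)
    assert np.array_equal(to_numpy(t), a)
    ggml_free(ctx)
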
class NativeObj:
    AllocFn = Callable[[], ctypes.c_void_p]
    FreeFn = Callable[[ctypes.c_void_p], None]
    _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}

    @classmethod
    def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
        if kind in cls._cache:
            return cls._cache[kind]
        alloc_fn = getattr(lib, f"{kind}_alloc")
        alloc_fn.argtypes = []
        alloc_fn.restype = ctypes.c_void_p
        free_fn = getattr(lib, f"{kind}_free")
        free_fn.argtypes = [ctypes.c_void_p]
        free_fn.restype = None
        cls._cache[kind] = (alloc_fn, free_fn)
        return (alloc_fn, free_fn)

    def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
        self.kind = kind
        alloc_fn, self._free_fn = self._init_c_func(kind)
        self.ptr = alloc_fn() if ptr is None else ptr
        # print(self)

    def free(self) -> None:
        if self.ptr is not None:
            self._free_fn(self.ptr)
            # print(f"freeing {self}")
            self.ptr = NULL

    def __enter__(self) -> ctypes.c_void_p:
        return self.ptr

    def __exit__(self, *args: Any) -> None:
        self.free()

    def __del__(self) -> None:
        self.free()

    def __repr__(self) -> str:
        return f"<{self.kind} native object at 0x{self.ptr:x}>"
def MeasureArena() -> NativeObj:
    return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))


def FixedSizeArena(mem_size: int) -> NativeObj:
    memory = torch.zeros(mem_size, dtype=torch.uint8)
    allocr = ggml_allocr_new(
        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
    )
    arena = NativeObj("ggml_allocr", allocr)
    # Add a reference from the arena object to the underlying tensor, otherwise it would be freed too early.
    setattr(arena, "__memory", memory)
    return arena
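
# Illustrative sketch of ggml's measure-then-allocate pattern, not part of the
# vendored API. It assumes third_party_ggml also binds ggml_allocr_alloc_graph,
# which belongs to the same allocator API as ggml_allocr_new/_new_measure used above.
def _example_measure_then_allocate(graph: ggml_cgraph_p) -> NativeObj:
    with MeasureArena() as measure_allocr:
        needed = ggml_allocr_alloc_graph(measure_allocr, graph) + GGML_MEM_ALIGN
    # Allocate a real arena of exactly the measured size; its .ptr can then be
    # passed wherever a ggml_allocr* is expected (e.g. unity_eval below).
    return FixedSizeArena(needed)
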
def UnityModel() -> NativeObj:
    return NativeObj("unity_model")


def GptVocab() -> NativeObj:
    return NativeObj("gpt_vocab")


def Fairseq2Model() -> NativeObj:
    return NativeObj("fairseq2_model")


lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)


@functools.lru_cache(1024)
def CppStr(content: str) -> NativeObj:
    c_str = ctypes.create_string_buffer(content.encode("utf-8"))
    cpp_str = lib.std_string_alloc(c_str)
    return NativeObj("std_string", cpp_str)
lib.unity_model_load.argtypes = [ctypes.c_char_p, ctypes.c_void_p, ctypes.c_void_p]


def unity_model_load(model_file: Path) -> Tuple[NativeObj, NativeObj]:
    model = UnityModel()
    vocab = GptVocab()
    lib.unity_model_load(
        ctypes.create_string_buffer(str(model_file).encode("utf-8")),
        model.ptr,
        vocab.ptr,
    )
    return model, vocab
lib.load_unity_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.load_unity_ggml_file.restype = ctypes.c_int


def load_unity_ggml_file(model_file: Path) -> NativeObj:
    model = Fairseq2Model()
    bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
    err = lib.load_unity_ggml_file(model.ptr, bytes_file)
    if err:
        raise Exception(f"Failed to load model file: {model_file}")
    return model
lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)


def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
    return lib.unity_audio_encoder_graph(model.ptr, tensor)  # type: ignore


lib.unity_eval.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.POINTER(ggml_tensor),
    ctypes.c_int,
]
lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)


def unity_eval(
    allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
) -> ggml_cgraph_p:
    return lib.unity_eval(allocr, model.ptr, tensor, n_threads)
_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}


def forward(
    layer_name: str, model: NativeObj, prefix: str, *inputs: ggml_tensor_p
) -> ggml_tensor_p:
    fwd: Any = _FORWARD_CACHE.get(layer_name)
    if fwd is None:
        fwd = getattr(lib, layer_name + "_forward")
        num_inputs = len(inputs)
        fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
            ctypes.POINTER(ggml_tensor)
        ] * num_inputs
        fwd.restype = ctypes.POINTER(ggml_tensor)
        _FORWARD_CACHE[layer_name] = fwd

    # CppStr results are memoized by lru_cache, so do not free the std::string here:
    # the cached NativeObj keeps it alive for later calls with the same prefix.
    std_prefix = CppStr(prefix)
    return fwd(model.ptr, std_prefix.ptr, *inputs)  # type: ignore[no-any-return]
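
# Illustrative call sketch (the layer name and prefix below are placeholders, not
# symbols guaranteed to be exported by `lib`):
#
#     out = forward("StandardConformerEncoderLayer", model, "speech_encoder.layers.0", seqs)
#
# resolves lib.StandardConformerEncoderLayer_forward, fixes its ctypes signature
# once, caches it in _FORWARD_CACHE, and calls it as
# (model.ptr, std::string* prefix, ggml_tensor* seqs), returning a ggml_tensor*.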