ggml.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
  5. import numpy as np
  6. import ctypes
  7. import torch
  8. import functools
  9. from pathlib import Path
  10. from typing import Dict
  11. from typing import Callable
  12. from typing import Any
  13. from typing import Tuple
  14. from typing import Union
  15. from typing import Type
  16. from third_party_ggml import *
  17. from ctypes_utils import c_struct, c_fn, Ptr
  18. ### Helpers
  19. def numpy_dtype(ggml_type: ctypes.c_int) -> type:
  20. if ggml_type == 0:
  21. # GGML_TYPE_F32 = 0,
  22. return np.float32
  23. if ggml_type == 1:
  24. # GGML_TYPE_F16 = 1,
  25. return np.float16
  26. if ggml_type == 18:
  27. return np.int32
  28. raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")
  29. def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
  30. if dtype == np.float32:
  31. return ctypes.c_int(0)
  32. elif dtype == np.int32:
  33. return ctypes.c_int(18)
  34. elif dtype == np.float16:
  35. return ctypes.c_int(1)
  36. raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
  37. def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  38. if isinstance(tensor, ctypes._Pointer):
  39. tensor = tensor.contents
  40. ndims = tensor.n_dims
  41. return tuple([tensor.ne[i] for i in range(ndims)[::-1]])
  42. def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  43. if isinstance(tensor, ctypes._Pointer):
  44. tensor = tensor.contents
  45. return tuple([tensor.nb[i] for i in range(4)])
  46. def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  47. raise NotImplementedError()
  48. if isinstance(tensor, ctypes._Pointer):
  49. tensor = tensor.contents
  50. ndims = tensor.n_dims
  51. num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
  52. # TODO: convert to numpy strides
  53. return num_bytes
  54. def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
  55. if isinstance(tensor, ctypes._Pointer):
  56. tensor = tensor.contents
  57. n_dim = tensor.n_dims
  58. t_shape = shape(tensor)
  59. strides = nb(tensor)[:n_dim][::-1]
  60. # Convert the ggml data pointer to a pointer to ints with the same size (float16 -> uint16)
  61. # This is needed because Python ctypes doesn't have "float16", and `as_array` only works with ctypes
  62. type_size = ggml_type_size(tensor.type)
  63. int_width: type = getattr(ctypes, f"c_uint{8 * type_size}")
  64. ptr = ctypes.cast(tensor.data, ctypes.POINTER(int_width))
  65. # Create a numpy array with the wrong dtype
  66. int_arr = np.ctypeslib.as_array(ptr, shape=t_shape)
  67. # Reinterpret it to the right dtype
  68. res = np.frombuffer(int_arr, dtype=numpy_dtype(tensor.type)).reshape(t_shape)
  69. # Patch up strides to work with transposed ggml_tensor
  70. res.strides = strides
  71. return res
# Fixed-length ctypes array types (GGML_MAX_DIMS entries each) used by
# from_numpy to fill the `ne` (signed 64-bit) and `nb` (unsigned 64-bit)
# fields of a tensor.
GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS
  74. def from_file(
  75. ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
  76. ) -> ggml_tensor_p:
  77. data = np.fromfile(str(file), dtype=dtype).reshape(shape) # type: ignore
  78. return from_numpy(ctx, data)
  79. def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
  80. # in GGML ne[0] indicates the contiguous dimension, ie the last one in numpy and torch
  81. ne = shape[::-1]
  82. if len(ne) >= GGML_MAX_DIMS:
  83. return # type: ignore
  84. # ne is always of the same length
  85. padding = (1,) * (GGML_MAX_DIMS - len(ne))
  86. return ne + padding # type: ignore
  87. def _compute_nbytes(
  88. ne: Tuple[int, int, int, int], type: ctypes.c_int
  89. ) -> Tuple[int, int, int, int]:
  90. nb0 = ggml_type_size(type)
  91. nb1 = nb0 * (ne[0] // ggml_blck_size(type))
  92. nb2 = nb1 * ne[1]
  93. nb3 = nb2 * ne[2]
  94. return (nb0, nb1, nb2, nb3)
  95. def from_numpy(
  96. ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]
  97. ) -> ggml_tensor_p:
  98. if type(array).__name__ == "Tensor":
  99. array = array.numpy()
  100. # Create an empty tensor so we don't allocate memory for the data pointer
  101. gtype = from_numpy_dtype(array.dtype)
  102. tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
  103. # Fill out the correct dimensions and shape.
  104. tensor_p.contents.n_dims = array.ndim
  105. ne = _shape_to_ne(array.shape)
  106. tensor_p.contents.ne = GgmlNElem(*ne)
  107. tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
  108. # point the tensor data to the content of the numpy array.
  109. tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
  110. # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
  111. # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
  112. # prevent the underlying numpy array to be freed
  113. setattr(tensor_p, "__data", array)
  114. return tensor_p
  115. def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
  116. assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"
  117. return (
  118. (t0.contents.ne[0] == t1.contents.ne[0])
  119. and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
  120. and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
  121. )
  122. class NativeObj:
  123. AllocFn = Callable[[], ctypes.c_void_p]
  124. FreeFn = Callable[[ctypes.c_void_p], None]
  125. _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}
  126. @classmethod
  127. def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
  128. if kind in cls._cache:
  129. return cls._cache[kind]
  130. alloc_fn = getattr(lib, f"{kind}_alloc")
  131. alloc_fn.argtypes = []
  132. alloc_fn.restype = ctypes.c_void_p
  133. free_fn = getattr(lib, f"{kind}_free")
  134. free_fn.argtypes = [ctypes.c_void_p]
  135. free_fn.restype = None
  136. cls._cache[kind] = (alloc_fn, free_fn)
  137. return (alloc_fn, free_fn)
  138. def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
  139. self.kind = kind
  140. alloc_fn, self._free_fn = self._init_c_func(kind)
  141. self.ptr = alloc_fn() if ptr is None else ptr
  142. # print(self)
  143. def free(self) -> None:
  144. if self.ptr is not None:
  145. self._free_fn(self.ptr)
  146. # print(f"freeing {self}")
  147. self.ptr = NULL
  148. def __enter__(self) -> ctypes.c_void_p:
  149. return self.ptr
  150. def __exit__(self, *args: Any) -> None:
  151. self.free()
  152. def __del__(self) -> None:
  153. self.free()
  154. def __repr__(self) -> str:
  155. return f"<{self.kind} native object at 0x{self.ptr:x}>"
  156. def MeasureArena() -> NativeObj:
  157. return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))
  158. def FixedSizeArena(mem_size: int) -> NativeObj:
  159. memory = torch.zeros(mem_size, dtype=torch.uint8)
  160. allocr = ggml_allocr_new(
  161. ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
  162. )
  163. arena = NativeObj("ggml_allocr", allocr)
  164. # Add a reference from the arena object to the underlying tensor, otherwise it will be freed to early.
  165. setattr(arena, "__memory", memory)
  166. return arena
# Declare the parameters of the native fairseq2_model_set_inference_ctx
# function: an untyped pointer followed by a ggml context pointer.
lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]
  168. def Fairseq2Model() -> NativeObj:
  169. return NativeObj("fairseq2_model")
# std_string_alloc takes a C string argument, unlike the zero-argument
# `{kind}_alloc` signature that NativeObj._init_c_func would configure.
# Declare the signatures by hand and pre-register them in the NativeObj
# cache so that _init_c_func("std_string") returns these instead.
lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)
  175. @functools.lru_cache(1024)
  176. def CppStr(content: str) -> NativeObj:
  177. c_str = ctypes.create_string_buffer(content.encode("utf-8"))
  178. cpp_str = lib.std_string_alloc(c_str)
  179. return NativeObj("std_string", cpp_str)
  180. lib.load_unity_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
  181. lib.load_unity_ggml_file.restype = ctypes.c_int
  182. def load_unity_ggml_file(model_file: Path) -> NativeObj:
  183. model = Fairseq2Model()
  184. bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
  185. err = lib.load_unity_ggml_file(model.ptr, bytes_file)
  186. if err:
  187. raise Exception("Failed to load model")
  188. return model
  189. # lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
  190. # lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)
  191. # def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
  192. # return lib.unity_audio_encoder_graph(model.ptr, tensor) # type: ignore
  193. # lib.unity_eval.argtypes = [
  194. # ctypes.c_void_p,
  195. # ctypes.c_void_p,
  196. # ctypes.POINTER(ggml_tensor),
  197. # ctypes.c_int,
  198. # ]
  199. # lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)
  200. # def unity_eval(
  201. # allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
  202. # ) -> ggml_cgraph_p:
  203. # return lib.unity_eval(allocr, model.ptr, tensor, n_threads)
  204. _FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}
  205. def forward(
  206. layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
  207. ) -> ggml_tensor_p:
  208. fwd: Any = _FORWARD_CACHE.get(layer_name)
  209. if fwd is None:
  210. fwd = getattr(lib, layer_name + "_forward")
  211. num_inputs = len(inputs)
  212. fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
  213. ctypes.POINTER(ggml_tensor)
  214. ] * num_inputs
  215. fwd.restype = ctypes.POINTER(ggml_tensor)
  216. _FORWARD_CACHE[layer_name] = fwd
  217. with CppStr(prefix) as std_prefix:
  218. return fwd(model, std_prefix, *inputs) # ignore: type[no-any-return]
# NOTE(review): `c_fn` comes from ctypes_utils and is not visible here; it
# presumably uses the annotations below to configure the native function of
# the same name in `lib` -- confirm before changing the signature.
@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    """Call the native ``causal_attention_mask`` with ``ctx`` and ``seqs``."""
    return lib.causal_attention_mask(ctx, seqs)  # type: ignore[no-any-return]
# NOTE(review): `c_struct` comes from ctypes_utils and is not visible here;
# it presumably turns the annotated fields into a ctypes structure, in which
# case the field order and types must match the native definition -- confirm
# before reordering or retyping anything.
@c_struct
class SequenceGeneratorOptions:
    # Options embedded in SequenceGeneratorJob as its `opts` field.
    beam_size: int
    min_seq_len: int
    soft_max_seq_len_a: int
    soft_max_seq_len_b: int
    hard_max_seq_len: int
    len_penalty: float
    unk_penalty: float
    normalize_scores: bool
# NOTE(review): as with the other `c_struct` classes, the field order
# presumably has to match the native definition -- confirm in ctypes_utils.
@c_struct
class SequenceGeneratorJob:
    # Bundles the generation options with a prefix tensor and an EOS index;
    # a pointer to this struct is the `job` argument of generate_sequence.
    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    eos_idx: int
# NOTE(review): the body is only `...`, so the actual call is presumably
# supplied by the `c_fn` decorator from the annotated signature -- confirm
# in ctypes_utils before editing the parameters.
@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    output_seq: Ptr[ggml_tensor],
) -> float:
    """Binding for the native ``generate_sequence`` function; returns a float."""
    ...