  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
  5. import numpy as np
  6. import ctypes
  7. import torch
  8. import functools
  9. from pathlib import Path
  10. from typing import Dict
  11. from typing import Callable
  12. from typing import Any
  13. from typing import Tuple
  14. from typing import Union
  15. from typing import Type
  16. from third_party_ggml import *
  17. from ctypes_utils import c_struct, c_fn, Ptr
  18. ### Helpers


@functools.lru_cache(4)
def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype:
    if ggml_type == 0:
        # GGML_TYPE_F32 = 0,
        return np.dtype(np.float32)
    if ggml_type == 1:
        # GGML_TYPE_F16 = 1,
        return np.dtype(np.float16)
    if ggml_type == 18:
        # GGML_TYPE_I32 = 18,
        return np.dtype(np.int32)
    raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")


def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
    if dtype == np.float32:
        # GGML_TYPE_F32 = 0,
        return ctypes.c_int(0)
    elif dtype == np.int32:
        # GGML_TYPE_I32 = 18,
        return ctypes.c_int(18)
    elif dtype == np.float16:
        # GGML_TYPE_F16 = 1,
        return ctypes.c_int(1)
    raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
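
# Example (sketch): the two helpers are inverses of each other for the
# supported dtypes. `18` is assumed to be GGML_TYPE_I32 in the vendored
# ggml revision; the constant may differ in other ggml versions.
#
#   >>> numpy_dtype(from_numpy_dtype(np.dtype(np.float16)).value)
#   dtype('float16')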


def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    return tuple([tensor.ne[i] for i in range(ndims)[::-1]])


def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.nb[i] for i in range(4)])


def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.ne[i] for i in range(4)])


def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
    strides = num_bytes[::-1]
    return strides


def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if not ggml_is_contiguous(tensor_p):
        if not _almost_contiguous(tensor_p):
            return _strided_to_numpy(tensor_p)
    tensor = tensor_p.contents
    res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type))
    if ggml_is_transposed(tensor_p):
        # Patch up strides to work with transposed ggml_tensor
        res.strides = strides(tensor)  # type: ignore[assignment]
    return res


def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool:
    """Distinguishes between fully strided and just transposed."""
    tensor = tensor_p.contents
    num_bytes = nb(tensor)
    num_elem = ne(tensor)
    # Sort the axes according to 'num_bytes'
    nbe = sorted(zip(num_bytes, num_elem))
    itemsize = ggml_type_size(tensor.type)
    stride_exp = itemsize
    for stride, e in nbe:
        if stride != stride_exp:
            return False
        stride_exp *= e
    return True
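
# Worked example (sketch): transposing a contiguous f32 tensor with
# ne = (3, 2, 1, 1), nb = (4, 12, 24, 24) swaps the first two entries, giving
# ne = (2, 3, 1, 1), nb = (12, 4, 24, 24). Sorting axes by byte stride yields
# (4, 3), (12, 2), (24, 1), (24, 1): each stride is the running product of
# the itemsize and the faster axes' element counts, so the data is still one
# dense block and only the numpy strides need patching.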


def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if ggml_is_transposed(tensor_p):
        raise NotImplementedError(
            "to_numpy doesn't support tensors both transposed and strided."
        )

    tensor = tensor_p.contents
    n_dim = tensor.n_dims
    t_shape = shape(tensor)
    t_strides = strides(tensor)
    type_size = ggml_type_size(tensor.type)

    full_shape = []
    num_bytes = nb(tensor)

    # Determine the full backing slice of bytes to read.
    # TODO: make this work for transposed arrays
    n = 1
    total_elements = 1
    for d in range(n_dim - 1):
        n = num_bytes[d + 1] // type_size // n
        full_shape.append(n)
        total_elements *= n
    # We don't need to guess for the first dimension, since it doesn't impact striding.
    full_shape.append(t_shape[0])
    total_elements *= t_shape[0]
    full_shape = full_shape[::-1]

    res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))
    # Extract the correct slice
    res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
    # TODO: we could handle transposition here
    return res


def _void_p_to_np_array(
    data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype
) -> np.ndarray:
    # Convert the ggml data pointer to a pointer of bytes
    # This is needed because Python ctypes doesn't have "float16", and `as_array` only works with ctypes
    int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}")
    ptr = ctypes.cast(data, ctypes.POINTER(int_width))
    # Create a numpy array with the wrong dtype
    int_arr = np.ctypeslib.as_array(ptr, shape=shape)
    # Reinterpret it to the right dtype
    return np.frombuffer(int_arr, dtype=dtype).reshape(shape)
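
# Illustration (sketch): the same uint-to-float reinterpretation in pure
# numpy. 0x3C00 is the IEEE 754 half-precision bit pattern for 1.0, so
# viewing a uint16 buffer as float16 recovers the intended values without
# changing any bits.
#
#   >>> np.frombuffer(np.array([0x3C00], dtype=np.uint16), dtype=np.float16)
#   array([1.], dtype=float16)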


GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS


def from_file(
    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
) -> ggml_tensor_p:
    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
    return from_numpy(ctx, data)


def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
    # In GGML ne[0] indicates the contiguous dimension, i.e. the last one in numpy and torch
    ne = shape[::-1]
    if len(ne) >= GGML_MAX_DIMS:
        return ne  # type: ignore
    # ne must always have length GGML_MAX_DIMS, so pad with 1s
    padding = (1,) * (GGML_MAX_DIMS - len(ne))
    return ne + padding  # type: ignore


def _compute_nbytes(
    ne: Tuple[int, int, int, int], type: ctypes.c_int
) -> Tuple[int, int, int, int]:
    nb0 = ggml_type_size(type)
    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
    nb2 = nb1 * ne[1]
    nb3 = nb2 * ne[2]
    return (nb0, nb1, nb2, nb3)
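
# Worked example (sketch, f32 so the block size is 1): a numpy array of
# shape (2, 3) maps to ne = (3, 2, 1, 1), and the byte strides come out as
# nb = (4, 12, 24, 24): 4 bytes per element, 12 bytes per row of 3 elements,
# then 24 bytes for each of the remaining size-1 dimensions.
#
#   >>> _shape_to_ne((2, 3))
#   (3, 2, 1, 1)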


def from_numpy(
    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]
) -> ggml_tensor_p:
    if type(array).__name__ == "Tensor":
        array = array.numpy()
    # Create an empty tensor so we don't allocate memory for the data pointer
    gtype = from_numpy_dtype(array.dtype)
    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
    # Fill out the correct dimensions and shape.
    tensor_p.contents.n_dims = array.ndim
    ne = _shape_to_ne(array.shape)
    tensor_p.contents.ne = GgmlNElem(*ne)
    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
    # Point the tensor data to the content of the numpy array.
    tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
    # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
    # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
    # Prevent the underlying numpy array from being freed while the tensor is alive.
    setattr(tensor_p, "__data", array)
    return tensor_p
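
# Usage sketch (assumes a ggml context created through the vendored
# bindings; the memory size is arbitrary):
#
#   ctx = ggml_init(params=ggml_init_params(mem_size=16 * 1024 * 1024))
#   x = np.arange(6, dtype=np.float32).reshape(2, 3)
#   gx = from_numpy(ctx, x)  # zero-copy: gx aliases x's buffer
#   assert np.allclose(to_numpy(gx), x)
#
# Because the tensor aliases the numpy buffer, mutating `x` in place is
# visible through `gx` (and vice versa).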


def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
    assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"
    return (
        (t0.contents.ne[0] == t1.contents.ne[0])
        and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
        and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
    )
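
# Broadcast rule illustrated (sketch): with t0.ne = (4, 2, 1, 1) and
# t1.ne = (4, 5, 3, 2), the contiguous dimension matches (4 == 4) and the
# batch dimensions of t1 (3, 2) are multiples of t0's (1, 1), so t0 can be
# broadcast across the batch, mirroring ggml's mul_mat compatibility check.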


class NativeObj:
    AllocFn = Callable[[], ctypes.c_void_p]
    FreeFn = Callable[[ctypes.c_void_p], None]
    _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}

    @classmethod
    def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
        if kind in cls._cache:
            return cls._cache[kind]

        alloc_fn = getattr(lib, f"{kind}_alloc")
        alloc_fn.argtypes = []
        alloc_fn.restype = ctypes.c_void_p

        free_fn = getattr(lib, f"{kind}_free")
        free_fn.argtypes = [ctypes.c_void_p]
        free_fn.restype = None

        cls._cache[kind] = (alloc_fn, free_fn)
        return (alloc_fn, free_fn)

    def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
        self.kind = kind
        alloc_fn, self._free_fn = self._init_c_func(kind)
        self.ptr = alloc_fn() if ptr is None else ptr
        # print(self)

    def free(self) -> None:
        if self.ptr is not None:
            self._free_fn(self.ptr)
            # print(f"freeing {self}")
            self.ptr = NULL

    def __enter__(self) -> ctypes.c_void_p:
        return self.ptr

    def __exit__(self, *args: Any) -> None:
        self.free()

    def __del__(self) -> None:
        self.free()

    def __repr__(self) -> str:
        return f"<{self.kind} native object at 0x{self.ptr:x}>"
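
# Usage sketch: NativeObj pairs each C-side `{kind}_alloc`/`{kind}_free`
# symbol pair with RAII-style lifetime management, so native objects can be
# used as context managers, e.g. with CppStr (defined below):
#
#   with CppStr("encoder.layer.0") as std_prefix:
#       ...  # std_prefix is the raw c_void_p; the object is freed on exit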


def MeasureArena() -> NativeObj:
    return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))


def FixedSizeArena(mem_size: int) -> NativeObj:
    memory = torch.zeros(mem_size, dtype=torch.uint8)
    allocr = ggml_allocr_new(
        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
    )
    arena = NativeObj("ggml_allocr", allocr)
    # Add a reference from the arena object to the underlying tensor, otherwise it would be freed too early.
    setattr(arena, "__memory", memory)
    return arena
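
# Typical ggml allocator pattern (sketch, assuming the standard ggml_allocr
# API from the vendored ggml): run the graph once against a measure arena to
# learn how much memory it needs, then allocate a fixed-size arena for the
# real computation:
#
#   arena = MeasureArena()
#   needed = ggml_allocr_alloc_graph(arena.ptr, graph)  # returns bytes needed
#   arena = FixedSizeArena(needed + GGML_MEM_ALIGN)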


lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]


def Fairseq2Model() -> NativeObj:
    return NativeObj("fairseq2_model")


lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)


@functools.lru_cache(1024)
def CppStr(content: str) -> NativeObj:
    c_str = ctypes.create_string_buffer(content.encode("utf-8"))
    cpp_str = lib.std_string_alloc(c_str)
    return NativeObj("std_string", cpp_str)


lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.load_fairseq2_ggml_file.restype = ctypes.c_int


def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
    model = Fairseq2Model()
    bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
    if err:
        raise Exception(f"Failed to load model from {model_file} (error code {err})")
    return model


# lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
# lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)


# def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
#     return lib.unity_audio_encoder_graph(model.ptr, tensor)  # type: ignore


# lib.unity_eval.argtypes = [
#     ctypes.c_void_p,
#     ctypes.c_void_p,
#     ctypes.POINTER(ggml_tensor),
#     ctypes.c_int,
# ]
# lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)


# def unity_eval(
#     allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
# ) -> ggml_cgraph_p:
#     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)


_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}


def forward(
    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
) -> ggml_tensor_p:
    fwd: Any = _FORWARD_CACHE.get(layer_name)
    if fwd is None:
        fwd = getattr(lib, layer_name + "_forward")
        num_inputs = len(inputs)
        fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
            ctypes.POINTER(ggml_tensor)
        ] * num_inputs
        fwd.restype = ctypes.POINTER(ggml_tensor)
        _FORWARD_CACHE[layer_name] = fwd

    with CppStr(prefix) as std_prefix:
        return fwd(model, std_prefix, *inputs)  # type: ignore[no-any-return]
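
# Usage sketch (layer and prefix names depend on the model's C++ side; the
# ones below are hypothetical):
#
#   y = forward("StandardTransformerEncoderLayer", model.ptr,
#               "text_encoder.layers.0", gx)
#
# This dispatches to `StandardTransformerEncoderLayer_forward` in the native
# library, caching the resolved symbol and its ctypes signature.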


def build_and_compute(
    ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1
) -> None:
    gf = ggml_build_forward(tensor)
    ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads)
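
# End-to-end sketch, assuming a context `ctx` with enough scratch memory:
#
#   a = from_numpy(ctx, np.ones((2, 3), dtype=np.float32))
#   b = from_numpy(ctx, np.full((2, 3), 2.0, dtype=np.float32))
#   out = ggml_add(ctx, a, b)
#   build_and_compute(ctx, out)
#   assert np.allclose(to_numpy(out), 3.0)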


# The @c_fn decorator binds these declarations to the corresponding native
# symbols in `lib`; the Python bodies are placeholders and never run.
@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_slice(
    ctx: ggml_context_p,
    a: Ptr[ggml_tensor],
    axis: int,
    start: ctypes.c_int64,
    end: ctypes.c_int64,
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_flatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
) -> Ptr[ggml_tensor]:
    return a


@c_fn(lib)
def ggml_unflatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
) -> Ptr[ggml_tensor]:
    return a


@c_struct
class SequenceGeneratorOptions:
    beam_size: int
    min_seq_len: int
    soft_max_seq_len_a: float
    soft_max_seq_len_b: int
    hard_max_seq_len: int
    len_penalty: float
    unk_penalty: float
    normalize_scores: bool


@c_struct
class SequenceGeneratorJob:
    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    pad_idx: int
    unk_idx: int
    bos_idx: int
    eos_idx: int


@c_struct
class Hypothesis:
    seq: Ptr[ggml_tensor]
    """The generated sequence."""

    score: float
    """The score of the hypothesis."""

    step_scores: Ptr[ggml_tensor]
    """The score of each individual sequence step."""


@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    result_ctx: ggml_context_p,
) -> Ptr[Hypothesis]:
    ...


@c_fn(lib)
def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
    return Ptr()


@c_fn(lib)
def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
    pass