ggml.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
  5. import numpy as np
  6. import ctypes
  7. import torch
  8. import functools
  9. import logging
  10. from pathlib import Path
  11. from typing import Dict
  12. from typing import Callable
  13. from typing import Any
  14. from typing import Tuple
  15. from typing import Union
  16. from typing import Type
  17. from third_party_ggml import *
  18. from ctypes_utils import c_struct, c_fn, Ptr
  19. ### Helpers
  20. @functools.lru_cache(4)
  21. def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype:
  22. if ggml_type == 0:
  23. # GGML_TYPE_F32 = 0,
  24. return np.dtype(np.float32)
  25. if ggml_type == 1:
  26. # GGML_TYPE_F16 = 1,
  27. return np.dtype(np.float16)
  28. if ggml_type == 18:
  29. return np.dtype(np.int32)
  30. raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")
  31. def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
  32. if dtype == np.float32:
  33. return ctypes.c_int(0)
  34. elif dtype == np.int32:
  35. return ctypes.c_int(18)
  36. elif dtype == np.float16:
  37. return ctypes.c_int(1)
  38. raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
  39. def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  40. if isinstance(tensor, ctypes._Pointer):
  41. tensor = tensor.contents
  42. ndims = tensor.n_dims
  43. return tuple([tensor.ne[i] for i in range(ndims)[::-1]])
  44. def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  45. if isinstance(tensor, ctypes._Pointer):
  46. tensor = tensor.contents
  47. return tuple([tensor.nb[i] for i in range(4)])
  48. def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  49. if isinstance(tensor, ctypes._Pointer):
  50. tensor = tensor.contents
  51. return tuple([tensor.ne[i] for i in range(4)])
  52. def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
  53. if isinstance(tensor, ctypes._Pointer):
  54. tensor = tensor.contents
  55. ndims = tensor.n_dims
  56. num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
  57. strides = num_bytes[::-1]
  58. return strides
  59. def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
  60. if not ggml_is_contiguous(tensor_p):
  61. if not _almost_contiguous(tensor_p):
  62. return _strided_to_numpy(tensor_p)
  63. tensor = tensor_p.contents
  64. res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type))
  65. if ggml_is_transposed(tensor_p):
  66. # Patch up strides to work with transposed ggml_tensor
  67. res.strides = strides(tensor) # type: ignore[assignment]
  68. return res
  69. def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool:
  70. """Distinguishes between fully strided and just transposed."""
  71. tensor = tensor_p.contents
  72. num_bytes = nb(tensor)
  73. num_elem = ne(tensor)
  74. # Sort the axis according to 'num_bytes'
  75. nbe = sorted(zip(num_bytes, num_elem))
  76. itemsize = ggml_type_size(tensor.type)
  77. stride_exp = itemsize
  78. for stride, e in nbe:
  79. if stride != stride_exp:
  80. return False
  81. stride_exp *= e
  82. return True
  83. def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
  84. if ggml_is_transposed(tensor_p):
  85. raise NotImplementedError(
  86. "to_numpy doesn't support tensors both transposed and strided."
  87. )
  88. tensor = tensor_p.contents
  89. n_dim = tensor.n_dims
  90. t_shape = shape(tensor)
  91. t_strides = strides(tensor)
  92. type_size = ggml_type_size(tensor.type)
  93. full_shape = []
  94. num_bytes = nb(tensor)
  95. # Determine the full backing slice of bytes to read.
  96. # TODO make this work for transposed array
  97. n = 1
  98. total_elements = 1
  99. try:
  100. for d in range(n_dim - 1):
  101. n = num_bytes[d + 1] // type_size // n
  102. full_shape.append(n)
  103. total_elements *= n
  104. except ZeroDivisionError:
  105. logging.warning("Can't convert permuted GGML tensor back to numpy")
  106. return None
  107. # We don't need to guess for the first dimension, since this doesn't impact striding.
  108. full_shape.append(t_shape[0])
  109. total_elements *= t_shape[0]
  110. full_shape = full_shape[::-1]
  111. res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))
  112. # Extract the correct slice
  113. res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
  114. # TODO: we could handle transposition here
  115. return res
  116. def _void_p_to_np_array(
  117. data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype
  118. ) -> np.ndarray:
  119. # Convert the ggml data pointer to a pointer of bytes
  120. # This is needed because Python ctypes doesn't have "float16", and `as_array` only works with ctypes
  121. int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}")
  122. ptr = ctypes.cast(data, ctypes.POINTER(int_width))
  123. # Create a numpy array with the wrong dtype
  124. int_arr = np.ctypeslib.as_array(ptr, shape=shape)
  125. # Reinterpret it to the right dtype
  126. return np.frombuffer(int_arr, dtype=dtype).reshape(shape)
# ctypes array types with GGML_MAX_DIMS entries each; from_numpy instantiates
# them to fill a tensor's `ne` (signed 64-bit) and `nb` (unsigned 64-bit) fields.
GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS
  129. def from_file(
  130. ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
  131. ) -> ggml_tensor_p:
  132. data = np.fromfile(str(file), dtype=dtype).reshape(shape) # type: ignore
  133. return from_numpy(ctx, data)
  134. def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
  135. # in GGML ne[0] indicates the contiguous dimension, ie the last one in numpy and torch
  136. ne = shape[::-1]
  137. if len(ne) >= GGML_MAX_DIMS:
  138. return ne # type: ignore
  139. # ne is always of the same length
  140. padding = (1,) * (GGML_MAX_DIMS - len(ne))
  141. return ne + padding # type: ignore
  142. def _compute_nbytes(
  143. ne: Tuple[int, int, int, int], type: ctypes.c_int
  144. ) -> Tuple[int, int, int, int]:
  145. nb0 = ggml_type_size(type)
  146. nb1 = nb0 * (ne[0] // ggml_blck_size(type))
  147. nb2 = nb1 * ne[1]
  148. nb3 = nb2 * ne[2]
  149. return (nb0, nb1, nb2, nb3)
  150. def from_numpy(
  151. ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"], name: bytes = b""
  152. ) -> ggml_tensor_p:
  153. if type(array).__name__ == "Tensor":
  154. array = array.numpy()
  155. # Create an empty tensor so we don't allocate memory for the data pointer
  156. gtype = from_numpy_dtype(array.dtype)
  157. tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
  158. # Fill out the correct dimensions and shape.
  159. tensor_p.contents.n_dims = array.ndim
  160. ne = _shape_to_ne(array.shape)
  161. tensor_p.contents.ne = GgmlNElem(*ne)
  162. tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
  163. # point the tensor data to the content of the numpy array.
  164. tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
  165. # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
  166. # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
  167. # prevent the underlying numpy array to be freed
  168. setattr(tensor_p, "__data", array)
  169. if name:
  170. ggml_set_name(tensor_p, name)
  171. return tensor_p
  172. def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
  173. assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"
  174. return (
  175. (t0.contents.ne[0] == t1.contents.ne[0])
  176. and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
  177. and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
  178. )
  179. def nodes(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
  180. res = {}
  181. for i in range(gf.n_nodes):
  182. name = gf.nodes[i].contents.name
  183. res[name] = gf.nodes[i]
  184. return res
  185. def leafs(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
  186. res = {}
  187. for i in range(gf.n_leafs):
  188. name = gf.leafs[i].contents.name
  189. res[name] = gf.leafs[i]
  190. return res
  191. class NativeObj:
  192. AllocFn = Callable[[], ctypes.c_void_p]
  193. FreeFn = Callable[[ctypes.c_void_p], None]
  194. _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}
  195. @classmethod
  196. def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
  197. if kind in cls._cache:
  198. return cls._cache[kind]
  199. alloc_fn = getattr(lib, f"{kind}_alloc")
  200. alloc_fn.argtypes = []
  201. alloc_fn.restype = ctypes.c_void_p
  202. free_fn = getattr(lib, f"{kind}_free")
  203. free_fn.argtypes = [ctypes.c_void_p]
  204. free_fn.restype = None
  205. cls._cache[kind] = (alloc_fn, free_fn)
  206. return (alloc_fn, free_fn)
  207. def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
  208. self.kind = kind
  209. alloc_fn, self._free_fn = self._init_c_func(kind)
  210. self.ptr = alloc_fn() if ptr is None else ptr
  211. # print(self)
  212. def free(self) -> None:
  213. if self.ptr is not None:
  214. self._free_fn(self.ptr)
  215. # print(f"freeing {self}")
  216. self.ptr = NULL
  217. def __enter__(self) -> ctypes.c_void_p:
  218. return self.ptr
  219. def __exit__(self, *args: Any) -> None:
  220. self.free()
  221. def __del__(self) -> None:
  222. self.free()
  223. def __repr__(self) -> str:
  224. return f"<{self.kind} native object at 0x{self.ptr:x}>"
  225. def MeasureArena() -> NativeObj:
  226. return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))
  227. def FixedSizeArena(mem_size: int) -> NativeObj:
  228. memory = torch.zeros(mem_size, dtype=torch.uint8)
  229. allocr = ggml_allocr_new(
  230. ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
  231. )
  232. arena = NativeObj("ggml_allocr", allocr)
  233. # Add a reference from the arena object to the underlying tensor, otherwise it will be freed to early.
  234. setattr(arena, "__memory", memory)
  235. return arena
# Declare the ctypes argument types of the native
# `fairseq2_model_set_inference_ctx`: an opaque pointer and a ggml context pointer.
lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]
  237. def Fairseq2Model() -> NativeObj:
  238. return NativeObj("fairseq2_model")
  239. lib.std_string_alloc.argtypes = [ctypes.c_char_p]
  240. lib.std_string_alloc.restype = ctypes.c_void_p
  241. lib.std_string_free.argtypes = [ctypes.c_void_p]
  242. lib.std_string_free.restype = None
  243. NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)
  244. def CppStr(content: str) -> NativeObj:
  245. c_str = ctypes.create_string_buffer(content.encode("utf-8"))
  246. cpp_str = lib.std_string_alloc(c_str)
  247. return NativeObj("std_string", cpp_str)
  248. lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
  249. lib.load_fairseq2_ggml_file.restype = ctypes.c_int
  250. def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
  251. model = Fairseq2Model()
  252. bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
  253. err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
  254. if err:
  255. raise Exception("Failed to load model")
  256. return model
  257. # lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
  258. # lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)
  259. # def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
  260. # return lib.unity_audio_encoder_graph(model.ptr, tensor) # type: ignore
  261. # lib.unity_eval.argtypes = [
  262. # ctypes.c_void_p,
  263. # ctypes.c_void_p,
  264. # ctypes.POINTER(ggml_tensor),
  265. # ctypes.c_int,
  266. # ]
  267. # lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)
  268. # def unity_eval(
  269. # allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
  270. # ) -> ggml_cgraph_p:
  271. # return lib.unity_eval(allocr, model.ptr, tensor, n_threads)
  272. _FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}
  273. def forward(
  274. layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
  275. ) -> ggml_tensor_p:
  276. fwd: Any = _FORWARD_CACHE.get(layer_name)
  277. if fwd is None:
  278. fwd = getattr(lib, layer_name + "_forward")
  279. num_inputs = len(inputs)
  280. fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
  281. ctypes.POINTER(ggml_tensor)
  282. ] * num_inputs
  283. fwd.restype = ctypes.POINTER(ggml_tensor)
  284. _FORWARD_CACHE[layer_name] = fwd
  285. with CppStr(prefix) as std_prefix:
  286. return fwd(model, std_prefix, *inputs) # ignore: type[no-any-return]
  287. def build_and_compute(
  288. ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1
  289. ) -> None:
  290. gf = ggml_build_forward(tensor)
  291. ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads)
@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    """Signature stub for ``causal_attention_mask``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib`` and the Python body is never executed -- confirm.
    """
    ...
@c_fn(lib)
def ggml_slice(
    ctx: ggml_context_p,
    a: Ptr[ggml_tensor],
    axis: int,
    start: ctypes.c_int64,
    end: ctypes.c_int64,
) -> Ptr[ggml_tensor]:
    """Signature stub for ``ggml_slice``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib`` and the Python body is never executed -- confirm.
    """
    ...
@c_fn(lib)
def ggml_flatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
) -> Ptr[ggml_tensor]:
    """Signature stub for ``ggml_flatten_1d``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib``, making the ``return a`` below a placeholder -- confirm.
    """
    return a
@c_fn(lib)
def ggml_unflatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
) -> Ptr[ggml_tensor]:
    """Signature stub for ``ggml_unflatten_1d``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib``, making the ``return a`` below a placeholder -- confirm.
    """
    return a
@c_struct
class SequenceGeneratorOptions:
    """Options record embedded in ``SequenceGeneratorJob`` as its ``opts`` field.

    NOTE(review): ``c_struct`` is defined in ``ctypes_utils`` (not visible
    here); presumably it builds a ctypes structure from these annotations, in
    which case field order and types must mirror the native struct -- confirm.
    """

    beam_size: int
    min_seq_len: int
    soft_max_seq_len_a: float
    soft_max_seq_len_b: int
    hard_max_seq_len: int
    len_penalty: float
    unk_penalty: float
    normalize_scores: bool
@c_struct
class SequenceGeneratorJob:
    """Job record passed by pointer to ``generate_sequence``.

    NOTE(review): ``c_struct`` is defined in ``ctypes_utils`` (not visible
    here); presumably it builds a ctypes structure from these annotations, in
    which case field order and types must mirror the native struct -- confirm.
    """

    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    pad_idx: int
    unk_idx: int
    bos_idx: int
    eos_idx: int
@c_struct
class Hypothesis:
    # Result record; `generate_sequence` is declared as returning a pointer to it.
    # NOTE(review): `c_struct` is defined in `ctypes_utils` (not visible here);
    # presumably it builds a ctypes structure from these annotations -- confirm.
    seq: Ptr[ggml_tensor]
    """The generated sequence."""
    score: float
    """The score of the hypothesis."""
    step_scores: Ptr[ggml_tensor]
    """The score of each individual sequence step."""
@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    result_ctx: ggml_context_p,
) -> Ptr[Hypothesis]:
    """Signature stub for ``generate_sequence``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib`` and the Python body is never executed -- confirm.
    """
    ...
@c_fn(lib)
def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
    """Signature stub for ``_testing_return_hypothesis_ptr``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib``, making the ``return Ptr()`` below a placeholder -- confirm.
    """
    return Ptr()
@c_fn(lib)
def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
    """Signature stub for ``fairseq2_model_layer_config_int``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib``, making the ``return -1`` below a placeholder -- confirm.
    """
    return -1
@c_fn(lib)
def fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> None:
    """Signature stub for ``fairseq2_kv_cache_alloc``.

    NOTE(review): ``c_fn`` is defined in ``ctypes_utils`` (not visible here);
    presumably it binds this signature to the symbol of the same name in
    ``lib`` and the Python body is never executed -- confirm.
    """
    pass
  361. pass