  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
import contextlib
import ctypes
import dataclasses
import functools
import logging
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, NamedTuple, Tuple, Type, Union

import numpy as np
import torch

from ctypes_utils import NULLPTR, Ptr, c_fn, c_struct
from third_party_ggml import *

### Helpers
@functools.lru_cache(4)
def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype:
    if ggml_type == 0:
        # GGML_TYPE_F32 = 0,
        return np.dtype(np.float32)
    if ggml_type == 1:
        # GGML_TYPE_F16 = 1,
        return np.dtype(np.float16)
    if ggml_type == 18:
        # GGML_TYPE_I32 = 18,
        return np.dtype(np.int32)
    raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")

@functools.lru_cache()
def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
    def _ggml_type(name: bytes, value: int) -> ctypes.c_int:
        t = ctypes.c_int(value)
        type_name = ggml_type_name(t)
        if name != type_name:
            raise RuntimeError(
                f"Type {name!r} doesn't have value {value}. ggml.h was probably updated but not ggml.py"
            )
        return t

    if dtype == np.float32:
        return _ggml_type(b"f32", 0)
    elif dtype == np.float16:
        return _ggml_type(b"f16", 1)
    elif dtype == np.dtype("bool"):
        return _ggml_type(b"i8", 16)
    elif dtype == np.int32:
        return _ggml_type(b"i32", 18)
    raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")

def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    return tuple([tensor.ne[i] for i in range(ndims)[::-1]])


def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.nb[i] for i in range(4)])


def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.ne[i] for i in range(4)])


def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
    strides = num_bytes[::-1]
    return strides

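# Example (illustrative values): ggml stores ne[0] as the contiguous axis, so dimensions
# come out reversed relative to numpy/torch. For a float32 tensor built from a (2, 3) array:
#   shape(t)   -> (2, 3)            # numpy/torch order
#   ne(t)      -> (3, 2, 1, 1)      # raw ggml order, padded to GGML_MAX_DIMS
#   nb(t)      -> (4, 12, 24, 24)   # bytes to step along each ggml axis
#   strides(t) -> (12, 4)           # numpy-style strides, in bytes
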
def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if not ggml_is_contiguous(tensor_p):
        if not _almost_contiguous(tensor_p):
            return _strided_to_numpy(tensor_p)
    tensor = tensor_p.contents
    res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type))
    if ggml_is_transposed(tensor_p):
        # Patch up strides to work with transposed ggml_tensor
        res.strides = strides(tensor)  # type: ignore[assignment]
    return res

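# Usage sketch (assumes a valid ggml context `ctx` and from_numpy defined further below):
#   a = np.arange(6, dtype=np.float32).reshape(2, 3)
#   t = from_numpy(ctx, a)
#   assert np.allclose(to_numpy(t), a)
# Note: the returned array is a view over the ggml buffer, so it is only valid while the
# underlying tensor data stays alive.
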
def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool:
    """Distinguishes between fully strided and just transposed."""
    tensor = tensor_p.contents
    num_bytes = nb(tensor)
    num_elem = ne(tensor)
    # Sort the axes according to 'num_bytes'
    nbe = sorted(zip(num_bytes, num_elem))
    itemsize = ggml_type_size(tensor.type)
    stride_exp = itemsize
    for stride, e in nbe:
        if stride != stride_exp:
            return False
        stride_exp *= e
    return True

def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if ggml_is_transposed(tensor_p):
        raise NotImplementedError(
            "to_numpy doesn't support tensors both transposed and strided."
        )
    tensor = tensor_p.contents
    n_dim = tensor.n_dims
    t_shape = shape(tensor)
    t_strides = strides(tensor)
    type_size = ggml_type_size(tensor.type)
    full_shape = []
    num_bytes = nb(tensor)
    # Determine the full backing slice of bytes to read.
    # TODO make this work for transposed array
    n = 1
    total_elements = 1
    try:
        for d in range(n_dim - 1):
            n = num_bytes[d + 1] // type_size // n
            full_shape.append(n)
            total_elements *= n
    except ZeroDivisionError:
        logging.warning("Can't convert permuted GGML tensor back to numpy")
        return None
    # We don't need to guess for the first dimension, since this doesn't impact striding.
    full_shape.append(t_shape[0])
    total_elements *= t_shape[0]
    full_shape = full_shape[::-1]
    res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))
    # Extract the correct slice
    res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
    # TODO: we could handle transposition here
    return res

def _void_p_to_np_array(
    data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype
) -> np.ndarray:
    # Cast the ggml data pointer to a pointer of fixed-width unsigned ints:
    # ctypes has no "float16", and `np.ctypeslib.as_array` only works with ctypes types.
    int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}")
    ptr = ctypes.cast(data, ctypes.POINTER(int_width))
    # Create a numpy array with the wrong dtype
    int_arr = np.ctypeslib.as_array(ptr, shape=shape)
    # Reinterpret it to the right dtype
    return np.frombuffer(int_arr, dtype=dtype).reshape(shape)

GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS


def from_file(
    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
) -> ggml_tensor_p:
    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
    return from_numpy(ctx, data)

def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
    # In GGML, ne[0] indicates the contiguous dimension, i.e. the last one in numpy and torch.
    ne = shape[::-1]
    if len(ne) >= GGML_MAX_DIMS:
        return ne  # type: ignore
    # ne always has GGML_MAX_DIMS entries, so pad the remaining dimensions with 1.
    padding = (1,) * (GGML_MAX_DIMS - len(ne))
    return ne + padding  # type: ignore

def _compute_nbytes(
    ne: Tuple[int, int, int, int], type: ctypes.c_int
) -> Tuple[int, int, int, int]:
    nb0 = ggml_type_size(type)
    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
    nb2 = nb1 * ne[1]
    nb3 = nb2 * ne[2]
    return (nb0, nb1, nb2, nb3)

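# Worked example: for a float32 tensor of numpy shape (2, 3):
#   _shape_to_ne((2, 3))  -> (3, 2, 1, 1)
#   _compute_nbytes((3, 2, 1, 1), from_numpy_dtype(np.dtype(np.float32)))  -> (4, 12, 24, 24)
# i.e. 4 bytes per element, 12 bytes per row of 3 elements, 24 bytes for the full tensor,
# and nb[3] repeats nb[2] * ne[2] since the 4th dimension has size 1.
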
def from_numpy(
    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"], name: bytes = b""
) -> Ptr[ggml_tensor]:
    if type(array).__name__ == "Tensor":
        array = array.numpy()
    # Create an empty tensor so we don't allocate memory for the data pointer
    gtype = from_numpy_dtype(array.dtype)
    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
    # Fill out the correct dimensions and shape.
    tensor_p.contents.n_dims = array.ndim
    ne = _shape_to_ne(array.shape)
    tensor_p.contents.ne = GgmlNElem(*ne)
    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
    # Point the tensor data to the content of the numpy array.
    tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
    # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
    # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
    # Prevent the underlying numpy array from being freed while the tensor is alive.
    setattr(tensor_p, "__data", array)
    if name:
        ggml_set_name(tensor_p, name)
    return tensor_p  # type: ignore

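# Round-trip sketch (assumes the ggml_init/ggml_init_params bindings from third_party_ggml;
# the memory size is illustrative):
#   ctx = ggml_init(params=ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None))
#   a = np.random.rand(4, 8).astype(np.float32)
#   t = from_numpy(ctx, a, name=b"a")
#   assert to_numpy(t).shape == (4, 8)
# The tensor only points at the numpy buffer: `__data` keeps the array alive as long as
# the tensor wrapper exists, and mutating `a` also mutates what ggml sees.
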
def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
    assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"
    return (
        (t0.contents.ne[0] == t1.contents.ne[0])
        and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
        and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
    )

def nodes(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_nodes):
        name = gf.nodes[i].contents.name
        res[name] = gf.nodes[i]
    return res


def leafs(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_leafs):
        name = gf.leafs[i].contents.name
        res[name] = gf.leafs[i]
    return res

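# Sketch: after building a graph, intermediate tensors can be looked up by the name the
# C++ code gave them with ggml_set_name (the exact byte-string keys depend on the model):
#   gf = ggml_build_forward(output_tensor)
#   named = nodes(gf)
#   attn = named.get(b"attn_weights")  # hypothetical node name
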
class NativeObj:
    AllocFn = Callable[[], ctypes.c_void_p]
    FreeFn = Callable[[ctypes.c_void_p], None]
    _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}

    @classmethod
    def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
        if kind in cls._cache:
            return cls._cache[kind]

        alloc_fn = getattr(lib, f"{kind}_alloc")
        alloc_fn.argtypes = []
        alloc_fn.restype = ctypes.c_void_p

        free_fn = getattr(lib, f"{kind}_free")
        free_fn.argtypes = [ctypes.c_void_p]
        free_fn.restype = None

        cls._cache[kind] = (alloc_fn, free_fn)
        return (alloc_fn, free_fn)

    def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
        self.kind = kind
        alloc_fn, self._free_fn = self._init_c_func(kind)
        self.ptr = alloc_fn() if ptr is None else ptr
        # print(self)

    def free(self) -> None:
        if self.ptr is not None:
            self._free_fn(self.ptr)
            # print(f"freeing {self}")
            self.ptr = NULL

    def __enter__(self) -> ctypes.c_void_p:
        return self.ptr

    def __exit__(self, *args: Any) -> None:
        self.free()

    def __del__(self) -> None:
        self.free()

    def __repr__(self) -> str:
        return f"<{self.kind} native object at 0x{self.ptr:x}>"

def MeasureArena() -> NativeObj:
    return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))


def FixedSizeArena(mem_size: int) -> NativeObj:
    memory = torch.zeros(mem_size, dtype=torch.uint8)
    allocr = ggml_allocr_new(
        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
    )
    arena = NativeObj("ggml_allocr", allocr)
    # Add a reference from the arena object to the underlying tensor, otherwise it would be freed too early.
    setattr(arena, "__memory", memory)
    return arena

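# Sketch: NativeObj ties a C-side `<kind>_alloc`/`<kind>_free` pair to a Python object's
# lifetime, so arenas and other handles can be scoped with `with` (the size is illustrative):
#   arena = FixedSizeArena(16 * 1024 * 1024)
#   with arena as allocr_ptr:
#       ...  # allocr_ptr is the raw ctypes.c_void_p; it is freed on exit
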
lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]


def Fairseq2Model() -> NativeObj:
    return NativeObj("fairseq2_model")


lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)


def CppStr(content: str) -> NativeObj:
    c_str = ctypes.create_string_buffer(content.encode("utf-8"))
    cpp_str = lib.std_string_alloc(c_str)
    return NativeObj("std_string", cpp_str)

lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.load_fairseq2_ggml_file.restype = ctypes.c_int


def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
    model = Fairseq2Model()
    bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
    if err:
        raise Exception("Failed to load model")
    return model

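# Sketch (the .ggml file name is hypothetical):
#   model = load_fairseq2_ggml_file(Path("seamlessM4T_medium.ggml"))
#   # model.ptr is the raw handle expected by forward() and generate_sequence() below.
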
# lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
# lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)


# def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
#     return lib.unity_audio_encoder_graph(model.ptr, tensor)  # type: ignore


# lib.unity_eval.argtypes = [
#     ctypes.c_void_p,
#     ctypes.c_void_p,
#     ctypes.POINTER(ggml_tensor),
#     ctypes.c_int,
# ]
# lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)


# def unity_eval(
#     allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
# ) -> ggml_cgraph_p:
#     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)

_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}


def forward(
    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
) -> ggml_tensor_p:
    fwd: Any = _FORWARD_CACHE.get(layer_name)
    if fwd is None:
        fwd = getattr(lib, layer_name + "_forward")
        num_inputs = len(inputs)
        fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
            ctypes.POINTER(ggml_tensor)
        ] * num_inputs
        fwd.restype = ctypes.POINTER(ggml_tensor)
        _FORWARD_CACHE[layer_name] = fwd

    with CppStr(prefix) as std_prefix:
        return fwd(model, std_prefix, *inputs)  # type: ignore[no-any-return]

def build_and_compute(
    ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1
) -> None:
    gf = ggml_build_forward(tensor)
    ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads)

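# Sketch of running a single layer end to end (assumes `ctx` and `model` from the sketches
# above; the layer name and prefix are hypothetical and must match a `<layer_name>_forward`
# symbol and a module prefix exported by the C++ side):
#   seqs = from_numpy(ctx, np.random.rand(1, 12, 1024).astype(np.float32))
#   out = forward("StandardTransformerEncoderLayer", model.ptr, "text_encoder.layers.0", seqs)
#   build_and_compute(ctx, out)
#   y = to_numpy(out)
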
@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_slice(
    ctx: ggml_context_p,
    a: Ptr[ggml_tensor],
    axis: int,
    start: ctypes.c_int64,
    end: ctypes.c_int64,
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_flatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
) -> Ptr[ggml_tensor]:
    return a


@c_fn(lib)
def ggml_unflatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
) -> Ptr[ggml_tensor]:
    return a

@c_struct
@dataclasses.dataclass
class SequenceGeneratorOptions:
    beam_size: int
    min_seq_len: int = 5
    soft_max_seq_len_a: float = 1.0
    soft_max_seq_len_b: int = 200
    hard_max_seq_len: int = 1024
    len_penalty: float = 1.0
    unk_penalty: float = 0.0
    normalize_scores: bool = True


@c_struct
@dataclasses.dataclass
class SequenceGeneratorJob:
    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    pad_idx: int
    unk_idx: int
    bos_idx: int
    eos_idx: int
    num_threads: int = 1


@c_struct
class Hypothesis:
    seq: Ptr[ggml_tensor]
    """The generated sequence."""

    score: float
    """The score of the hypothesis."""

    step_scores: Ptr[ggml_tensor]
    """The score of each individual sequence step."""

@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    result_ctx: ggml_context_p,
) -> Ptr[Hypothesis]:
    ...

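# Sketch of driving beam search (token indices and sizes are illustrative and must match the
# model's vocabulary; `encoder_out` is assumed to come from an earlier forward() call, and the
# result is assumed to hold `beam_size` hypotheses):
#   job = SequenceGeneratorJob()
#   job.opts = SequenceGeneratorOptions(beam_size=5)
#   job.prefix_seq = from_numpy(ctx, np.array([3], dtype=np.int32))
#   job.pad_idx, job.unk_idx, job.bos_idx, job.eos_idx = 0, 1, 2, 3
#   hyps = generate_sequence(model.ptr, ctypes.pointer(job), encoder_out, NULLPTR, ctx)
#   best = to_numpy(hyps[0].seq)
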
@c_fn(lib)
def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
    return Ptr()


@c_fn(lib)
def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: bytes) -> int:
    return -1


@c_fn(lib.fairseq2_kv_cache_alloc)
def _fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> None:
    pass


@c_fn(lib.fairseq2_kv_cache_reset)
def _fairseq2_kv_cache_reset(model: ctypes.c_void_p) -> None:
    pass

@contextlib.contextmanager
def fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> Iterator[None]:
    _fairseq2_kv_cache_alloc(model, beam_size, max_seq_len)
    try:
        yield
    finally:
        _fairseq2_kv_cache_reset(model)

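# Sketch: scope decoding in the context manager so the cache is reset even if an error is
# raised (sizes are illustrative):
#   with fairseq2_kv_cache_alloc(model.ptr, beam_size=5, max_seq_len=1024):
#       ...  # run forward()/generate_sequence() calls that rely on the KV cache
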
@c_fn(lib)
def fairseq2_spm_tokenize(
    model: ctypes.c_void_p, text: bytes, out: Ptr[ggml_tensor]
) -> None:
    pass


@c_fn(lib)
def fairseq2_spm_detokenize(
    model: ctypes.c_void_p, tensor: Ptr[ggml_tensor], out: ctypes.Array[ctypes.c_char]
) -> ctypes.c_size_t:
    return 0
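# Sketch (assumptions: `tokens` is an i32 ggml tensor of token ids produced earlier, the
# buffer size is large enough, and the return value is the number of bytes written into `out`):
#   buf = ctypes.create_string_buffer(4096)
#   n = fairseq2_spm_detokenize(model.ptr, tokens, buf)
#   text = buf.raw[: int(n)].decode("utf-8")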