  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
  5. import numpy as np
  6. import ctypes
  7. import torch
  8. import functools
  9. import logging
  10. import dataclasses
  11. import contextlib
  12. from typing import Iterator
  13. from typing import NamedTuple
  14. from pathlib import Path
  15. from typing import Dict
  16. from typing import Callable
  17. from typing import Any
  18. from typing import Tuple
  19. from typing import Union
  20. from typing import Type
  21. from third_party_ggml import *
  22. from ctypes_utils import c_struct, c_fn, Ptr
  23. ### Helpers


@functools.lru_cache(4)
def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype:
    if ggml_type == 0:
        # GGML_TYPE_F32 = 0
        return np.dtype(np.float32)
    if ggml_type == 1:
        # GGML_TYPE_F16 = 1
        return np.dtype(np.float16)
    if ggml_type == 18:
        # GGML_TYPE_I32 = 18
        return np.dtype(np.int32)
    raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")


def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
    if dtype == np.float32:
        return ctypes.c_int(0)
    elif dtype == np.int32:
        return ctypes.c_int(18)
    elif dtype == np.float16:
        return ctypes.c_int(1)
    raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")


def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    return tuple([tensor.ne[i] for i in range(ndims)[::-1]])


def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.nb[i] for i in range(4)])


def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.ne[i] for i in range(4)])


def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
    strides = num_bytes[::-1]
    return strides


def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if not ggml_is_contiguous(tensor_p):
        if not _almost_contiguous(tensor_p):
            return _strided_to_numpy(tensor_p)
    tensor = tensor_p.contents
    res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type))
    if ggml_is_transposed(tensor_p):
        # Patch up strides to work with transposed ggml_tensor
        res.strides = strides(tensor)  # type: ignore[assignment]
    return res


def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool:
    """Distinguishes between fully strided and just transposed."""
    tensor = tensor_p.contents
    num_bytes = nb(tensor)
    num_elem = ne(tensor)
    # Sort the axes according to 'num_bytes'
    nbe = sorted(zip(num_bytes, num_elem))
    itemsize = ggml_type_size(tensor.type)
    stride_exp = itemsize
    for stride, e in nbe:
        if stride != stride_exp:
            return False
        stride_exp *= e
    return True


def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if ggml_is_transposed(tensor_p):
        raise NotImplementedError(
            "to_numpy doesn't support tensors both transposed and strided."
        )
    tensor = tensor_p.contents
    n_dim = tensor.n_dims
    t_shape = shape(tensor)
    t_strides = strides(tensor)
    type_size = ggml_type_size(tensor.type)
    full_shape = []
    num_bytes = nb(tensor)

    # Determine the full backing slice of bytes to read.
    # TODO make this work for transposed array
    n = 1
    total_elements = 1
    try:
        for d in range(n_dim - 1):
            n = num_bytes[d + 1] // type_size // n
            full_shape.append(n)
            total_elements *= n
    except ZeroDivisionError:
        logging.warning("Can't convert permuted GGML tensor back to numpy")
        return None
    # We don't need to guess for the first dimension, since this doesn't impact striding.
    full_shape.append(t_shape[0])
    total_elements *= t_shape[0]
    full_shape = full_shape[::-1]
    res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))

    # Extract the correct slice
    res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
    # TODO: we could handle transposition here
    return res


def _void_p_to_np_array(
    data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype
) -> np.ndarray:
    # Convert the ggml data pointer to a pointer over unsigned ints of the matching width.
    # This is needed because Python ctypes doesn't have "float16", and `as_array` only works with ctypes types.
    int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}")
    ptr = ctypes.cast(data, ctypes.POINTER(int_width))
    # Create a numpy array with the wrong dtype
    int_arr = np.ctypeslib.as_array(ptr, shape=shape)
    # Reinterpret it to the right dtype
    return np.frombuffer(int_arr, dtype=dtype).reshape(shape)


GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS


def from_file(
    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
) -> ggml_tensor_p:
    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
    return from_numpy(ctx, data)


def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
    # In GGML, ne[0] indicates the contiguous dimension, i.e. the last one in numpy and torch.
    ne = shape[::-1]
    if len(ne) >= GGML_MAX_DIMS:
        return ne  # type: ignore
    # ne always has GGML_MAX_DIMS entries, so pad the missing dims with 1.
    padding = (1,) * (GGML_MAX_DIMS - len(ne))
    return ne + padding  # type: ignore


def _compute_nbytes(
    ne: Tuple[int, int, int, int], type: ctypes.c_int
) -> Tuple[int, int, int, int]:
    nb0 = ggml_type_size(type)
    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
    nb2 = nb1 * ne[1]
    nb3 = nb2 * ne[2]
    return (nb0, nb1, nb2, nb3)
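

# Illustrative sketch (not part of the vendored API): how a numpy shape maps onto
# GGML's reversed `ne` dims and cumulative `nb` byte strides, using the helpers above.
def _example_shape_to_ne_and_nbytes() -> None:
    # A float32 array of shape (2, 3): dims are reversed and padded with 1s.
    ne = _shape_to_ne((2, 3))
    assert ne == (3, 2, 1, 1)
    # With GGML_TYPE_F32 (4-byte items, block size 1) the byte strides accumulate:
    # nb = (4, 4 * 3, 12 * 2, 24 * 1) = (4, 12, 24, 24)
    assert _compute_nbytes(ne, from_numpy_dtype(np.dtype(np.float32))) == (4, 12, 24, 24)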


def from_numpy(
    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"], name: bytes = b""
) -> Ptr[ggml_tensor]:
    if type(array).__name__ == "Tensor":
        array = array.numpy()
    # Create an empty tensor so we don't allocate memory for the data pointer
    gtype = from_numpy_dtype(array.dtype)
    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
    # Fill out the correct dimensions and shape.
    tensor_p.contents.n_dims = array.ndim
    ne = _shape_to_ne(array.shape)
    tensor_p.contents.ne = GgmlNElem(*ne)
    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
    # Point the tensor data to the content of the numpy array.
    tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
    # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
    # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
    # Keep a reference to the numpy array to prevent it from being freed.
    setattr(tensor_p, "__data", array)
    if name:
        ggml_set_name(tensor_p, name)
    return tensor_p  # type: ignore
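

# Illustrative sketch (not part of the vendored API) of the zero-copy round trip this
# module provides: wrap a numpy array with `from_numpy` and read it back with
# `to_numpy`. The context size is an arbitrary assumption; `from_numpy` only stores
# metadata in the context, never the array data itself.
def _example_numpy_round_trip() -> None:
    ctx = ggml_init(ggml_init_params(mem_size=16 * 1024, mem_buffer=None))
    try:
        x = np.arange(6, dtype=np.float32).reshape(2, 3)
        gx = from_numpy(ctx, x)  # shares x's memory, no copy
        assert np.allclose(to_numpy(gx), x)
    finally:
        ggml_free(ctx)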


def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
    assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"
    return (
        (t0.contents.ne[0] == t1.contents.ne[0])
        and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
        and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
    )


def nodes(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_nodes):
        name = gf.nodes[i].contents.name
        res[name] = gf.nodes[i]
    return res


def leafs(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_leafs):
        name = gf.leafs[i].contents.name
        res[name] = gf.leafs[i]
    return res


class NativeObj:
    AllocFn = Callable[[], ctypes.c_void_p]
    FreeFn = Callable[[ctypes.c_void_p], None]
    _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}

    @classmethod
    def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
        if kind in cls._cache:
            return cls._cache[kind]

        alloc_fn = getattr(lib, f"{kind}_alloc")
        alloc_fn.argtypes = []
        alloc_fn.restype = ctypes.c_void_p

        free_fn = getattr(lib, f"{kind}_free")
        free_fn.argtypes = [ctypes.c_void_p]
        free_fn.restype = None

        cls._cache[kind] = (alloc_fn, free_fn)
        return (alloc_fn, free_fn)

    def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
        self.kind = kind
        alloc_fn, self._free_fn = self._init_c_func(kind)
        self.ptr = alloc_fn() if ptr is None else ptr
        # print(self)

    def free(self) -> None:
        if self.ptr is not None:
            self._free_fn(self.ptr)
            # print(f"freeing {self}")
            self.ptr = NULL

    def __enter__(self) -> ctypes.c_void_p:
        return self.ptr

    def __exit__(self, *args: Any) -> None:
        self.free()

    def __del__(self) -> None:
        self.free()

    def __repr__(self) -> str:
        return f"<{self.kind} native object at 0x{self.ptr:x}>"


def MeasureArena() -> NativeObj:
    return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))


def FixedSizeArena(mem_size: int) -> NativeObj:
    memory = torch.zeros(mem_size, dtype=torch.uint8)
    allocr = ggml_allocr_new(
        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
    )
    arena = NativeObj("ggml_allocr", allocr)
    # Add a reference from the arena object to the underlying tensor, otherwise it will be freed too early.
    setattr(arena, "__memory", memory)
    return arena


lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]


def Fairseq2Model() -> NativeObj:
    return NativeObj("fairseq2_model")


lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)


def CppStr(content: str) -> NativeObj:
    c_str = ctypes.create_string_buffer(content.encode("utf-8"))
    cpp_str = lib.std_string_alloc(c_str)
    return NativeObj("std_string", cpp_str)


lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.load_fairseq2_ggml_file.restype = ctypes.c_int


def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
    model = Fairseq2Model()
    bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
    if err:
        raise Exception("Failed to load model")
    return model
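

# Illustrative sketch (not part of the vendored API); the checkpoint path is a
# placeholder for a converted fairseq2 GGML file.
def _example_load_model() -> NativeObj:
    return load_fairseq2_ggml_file(Path("/path/to/model.ggml"))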


# lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
# lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)


# def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
#     return lib.unity_audio_encoder_graph(model.ptr, tensor)  # type: ignore


# lib.unity_eval.argtypes = [
#     ctypes.c_void_p,
#     ctypes.c_void_p,
#     ctypes.POINTER(ggml_tensor),
#     ctypes.c_int,
# ]
# lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)


# def unity_eval(
#     allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
# ) -> ggml_cgraph_p:
#     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)


_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}


def forward(
    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
) -> ggml_tensor_p:
    fwd: Any = _FORWARD_CACHE.get(layer_name)
    if fwd is None:
        fwd = getattr(lib, layer_name + "_forward")
        num_inputs = len(inputs)
        fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
            ctypes.POINTER(ggml_tensor)
        ] * num_inputs
        fwd.restype = ctypes.POINTER(ggml_tensor)
        _FORWARD_CACHE[layer_name] = fwd

    with CppStr(prefix) as std_prefix:
        return fwd(model, std_prefix, *inputs)  # type: ignore[no-any-return]


def build_and_compute(
    ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1
) -> None:
    gf = ggml_build_forward(tensor)
    ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads)
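

# Illustrative sketch (not part of the vendored API) of how the pieces combine: bind a
# layer's native forward function, build the ggml graph for its output, run it, and
# read the result back into numpy. The layer name ("LayerNorm") and weight prefix
# ("text_encoder.layer_norm") are hypothetical; they must match symbols actually
# exported by the compiled fairseq2 library.
def _example_forward_and_compute(
    ctx: ggml_context_p, model: NativeObj, x: np.ndarray
) -> np.ndarray:
    gx = from_numpy(ctx, x)
    gy = forward("LayerNorm", model.ptr, "text_encoder.layer_norm", gx)
    build_and_compute(ctx, gy)
    return to_numpy(gy)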


@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_slice(
    ctx: ggml_context_p,
    a: Ptr[ggml_tensor],
    axis: int,
    start: ctypes.c_int64,
    end: ctypes.c_int64,
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_flatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
) -> Ptr[ggml_tensor]:
    return a


@c_fn(lib)
def ggml_unflatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
) -> Ptr[ggml_tensor]:
    return a


@c_struct
@dataclasses.dataclass
class SequenceGeneratorOptions:
    beam_size: int
    min_seq_len: int = 5
    soft_max_seq_len_a: float = 1.0
    soft_max_seq_len_b: int = 200
    hard_max_seq_len: int = 1024
    len_penalty: float = 1.0
    unk_penalty: float = 0.0
    normalize_scores: bool = True


@c_struct
@dataclasses.dataclass
class SequenceGeneratorJob:
    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    pad_idx: int
    unk_idx: int
    bos_idx: int
    eos_idx: int
    num_threads: int = 1


@c_struct
class Hypothesis:
    seq: Ptr[ggml_tensor]
    """The generated sequence."""

    score: float
    """The score of the hypothesis."""

    step_scores: Ptr[ggml_tensor]
    """The score of each individual sequence step."""


@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    result_ctx: ggml_context_p,
) -> Ptr[Hypothesis]:
    ...
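

# Illustrative sketch (not part of the vendored API): assembling a generation job for
# the native beam search. Every index and size below is a placeholder; real values
# come from the model's vocabulary and generation config, and the exact construction
# of these c_struct dataclasses may differ.
def _example_generation_job(prefix_seq: Ptr[ggml_tensor]) -> SequenceGeneratorJob:
    opts = SequenceGeneratorOptions(
        beam_size=2,
        min_seq_len=1,
        soft_max_seq_len_a=1.0,
        soft_max_seq_len_b=200,
        hard_max_seq_len=128,
        len_penalty=1.0,
        unk_penalty=0.0,
        normalize_scores=True,
    )
    return SequenceGeneratorJob(
        opts=opts,
        prefix_seq=prefix_seq,
        pad_idx=0,
        unk_idx=1,
        bos_idx=2,
        eos_idx=3,
        num_threads=1,
    )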


@c_fn(lib)
def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
    return Ptr()


@c_fn(lib)
def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
    return -1


@c_fn(lib.fairseq2_kv_cache_alloc)
def _fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> None:
    pass


@c_fn(lib.fairseq2_kv_cache_reset)
def _fairseq2_kv_cache_reset(model: ctypes.c_void_p) -> None:
    pass


@contextlib.contextmanager
def fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> Iterator[None]:
    _fairseq2_kv_cache_alloc(model, beam_size, max_seq_len)
    try:
        yield
    finally:
        _fairseq2_kv_cache_reset(model)
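

# Illustrative sketch (not part of the vendored API): scoping the native KV cache to a
# single decoding pass. The beam size and sequence length are placeholders.
def _example_decode_with_kv_cache(model: ctypes.c_void_p) -> None:
    with fairseq2_kv_cache_alloc(model, beam_size=4, max_seq_len=128):
        ...  # run incremental decoding here; the cache is reset on exit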