  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
import contextlib
import ctypes
import dataclasses
import functools
import logging
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, NamedTuple, Tuple, Type, Union

import numpy as np
import torch

from third_party_ggml import *

from ctypes_utils import c_struct, c_fn, Ptr

### Helpers


@functools.lru_cache(4)
def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype:
    if ggml_type == 0:
        # GGML_TYPE_F32 = 0,
        return np.dtype(np.float32)
    if ggml_type == 1:
        # GGML_TYPE_F16 = 1,
        return np.dtype(np.float16)
    if ggml_type == 18:
        # GGML_TYPE_I32 = 18,
        return np.dtype(np.int32)
    raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")


@functools.lru_cache()
def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
    def _ggml_type(name: bytes, value: int) -> ctypes.c_int:
        t = ctypes.c_int(value)
        type_name = ggml_type_name(t)
        if name != type_name:
            raise RuntimeError(
                f"Type '{name}' doesn't have value {value}. ggml.h was probably updated but not ggml.py"
            )
        return t

    if dtype == np.float32:
        return _ggml_type(b"f32", 0)
    elif dtype == np.float16:
        return _ggml_type(b"f16", 1)
    elif dtype == np.dtype("bool"):
        return _ggml_type(b"i8", 16)
    elif dtype == np.int32:
        return _ggml_type(b"i32", 18)

    raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
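

# Sanity-check sketch: the two dtype helpers should round-trip for supported
# types. Note that ggml struct fields deliver the type as a plain int, so the
# ctypes.c_int returned by from_numpy_dtype is unwrapped with `.value` here:
#
#     gt = from_numpy_dtype(np.dtype(np.float16))  # ctypes.c_int(1)
#     assert numpy_dtype(gt.value) == np.float16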


def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    return tuple([tensor.ne[i] for i in range(ndims)[::-1]])


def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.nb[i] for i in range(4)])


def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.ne[i] for i in range(4)])


def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
    strides = num_bytes[::-1]
    return strides
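

# Layout note: ggml stores `ne` (elements) and `nb` (byte strides) with
# index 0 as the innermost, contiguous axis, the reverse of numpy. For a
# contiguous float32 tensor of numpy shape (4, 3):
#
#     ne(t)      -> (3, 4, 1, 1)     # ggml order, padded to GGML_MAX_DIMS
#     nb(t)      -> (4, 12, 48, 48)  # bytes per step along each ggml axis
#     shape(t)   -> (4, 3)           # numpy order
#     strides(t) -> (12, 4)          # numpy-style byte strides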


def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if not ggml_is_contiguous(tensor_p):
        if not _almost_contiguous(tensor_p):
            return _strided_to_numpy(tensor_p)
    tensor = tensor_p.contents

    res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type))

    if ggml_is_transposed(tensor_p):
        # Patch up strides to work with transposed ggml_tensor
        res.strides = strides(tensor)  # type: ignore[assignment]

    return res


def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool:
    """Distinguishes between fully strided and just transposed."""
    tensor = tensor_p.contents
    num_bytes = nb(tensor)
    num_elem = ne(tensor)

    # Sort the axes according to 'num_bytes'
    nbe = sorted(zip(num_bytes, num_elem))
    itemsize = ggml_type_size(tensor.type)
    stride_exp = itemsize
    for stride, e in nbe:
        if stride != stride_exp:
            return False
        stride_exp *= e

    return True
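

# Worked example: transposing the (4, 3) float32 tensor above swaps the first
# two entries of both `ne` and `nb`, giving ne = (4, 3, 1, 1) and
# nb = (12, 4, 48, 48). Sorting the (nb, ne) pairs by stride yields
# (4, 3), (12, 4), (48, 1), (48, 1), matching the dense progression
# 4 -> 4*3=12 -> 12*4=48 -> 48*1=48, so the tensor is "almost contiguous":
# transposed, but with no holes in its backing memory.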


def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if ggml_is_transposed(tensor_p):
        raise NotImplementedError(
            "to_numpy doesn't support tensors both transposed and strided."
        )

    tensor = tensor_p.contents
    n_dim = tensor.n_dims
    t_shape = shape(tensor)
    t_strides = strides(tensor)
    type_size = ggml_type_size(tensor.type)

    full_shape = []
    num_bytes = nb(tensor)

    # Determine the full backing slice of bytes to read.
    # TODO make this work for transposed array
    n = 1
    total_elements = 1
    try:
        for d in range(n_dim - 1):
            n = num_bytes[d + 1] // type_size // n
            full_shape.append(n)
            total_elements *= n
    except ZeroDivisionError:
        logging.warning("Can't convert permuted GGML tensor back to numpy")
        return None
    # We don't need to guess for the first dimension, since this doesn't impact striding.
    full_shape.append(t_shape[0])
    total_elements *= t_shape[0]

    full_shape = full_shape[::-1]
    res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))

    # Extract the correct slice
    res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
    # TODO: we could handle transposition here

    return res


def _void_p_to_np_array(
    data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype
) -> np.ndarray:
    # Convert the ggml data pointer to a pointer of bytes.
    # This is needed because Python ctypes doesn't have "float16",
    # and `as_array` only works with ctypes pointers.
    int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}")
    ptr = ctypes.cast(data, ctypes.POINTER(int_width))
    # Create a numpy array with the wrong dtype
    int_arr = np.ctypeslib.as_array(ptr, shape=shape)
    # Reinterpret it to the right dtype
    return np.frombuffer(int_arr, dtype=dtype).reshape(shape)


GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS


def from_file(
    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
) -> ggml_tensor_p:
    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
    return from_numpy(ctx, data)


def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
    # In GGML, ne[0] indicates the contiguous dimension, i.e. the last one in numpy and torch.
    ne = shape[::-1]
    if len(ne) >= GGML_MAX_DIMS:
        return ne  # type: ignore
    # Pad 'ne' so it is always of length GGML_MAX_DIMS.
    padding = (1,) * (GGML_MAX_DIMS - len(ne))
    return ne + padding  # type: ignore


def _compute_nbytes(
    ne: Tuple[int, int, int, int], type: ctypes.c_int
) -> Tuple[int, int, int, int]:
    nb0 = ggml_type_size(type)
    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
    nb2 = nb1 * ne[1]
    nb3 = nb2 * ne[2]
    return (nb0, nb1, nb2, nb3)
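

# Worked example (float32, block size 1): a numpy shape (4, 3) becomes
# ne = _shape_to_ne((4, 3)) == (3, 4, 1, 1), and
# _compute_nbytes((3, 4, 1, 1), gtype) == (4, 12, 48, 48): one element is
# 4 bytes, a row of 3 elements is 12 bytes, and the full 4x3 plane is 48.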


def from_numpy(
    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"], name: bytes = b""
) -> Ptr[ggml_tensor]:
    if type(array).__name__ == "Tensor":
        array = array.numpy()
    # Create an empty tensor so we don't allocate memory for the data pointer
    gtype = from_numpy_dtype(array.dtype)
    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
    # Fill out the correct dimensions and shape.
    tensor_p.contents.n_dims = array.ndim
    ne = _shape_to_ne(array.shape)
    tensor_p.contents.ne = GgmlNElem(*ne)
    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
    # Point the tensor data to the content of the numpy array.
    tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
    # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
    # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")

    # Prevent the underlying numpy array from being freed.
    setattr(tensor_p, "__data", array)
    if name:
        ggml_set_name(tensor_p, name)
    return tensor_p  # type: ignore
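

# Usage sketch (assumes `ctx` is a live context from ggml_init): from_numpy
# shares the numpy buffer with ggml instead of copying it, so a round trip
# through to_numpy views the same data:
#
#     a = np.random.rand(4, 3).astype(np.float32)
#     ga = from_numpy(ctx, a, name=b"a")
#     assert shape(ga) == a.shape
#     assert np.allclose(to_numpy(ga), a)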


def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
    assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"

    return (
        (t0.contents.ne[0] == t1.contents.ne[0])
        and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
        and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
    )


def nodes(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_nodes):
        name = gf.nodes[i].contents.name
        res[name] = gf.nodes[i]
    return res


def leafs(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_leafs):
        name = gf.leafs[i].contents.name
        res[name] = gf.leafs[i]
    return res


class NativeObj:
    AllocFn = Callable[[], ctypes.c_void_p]
    FreeFn = Callable[[ctypes.c_void_p], None]
    _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}

    @classmethod
    def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
        if kind in cls._cache:
            return cls._cache[kind]

        alloc_fn = getattr(lib, f"{kind}_alloc")
        alloc_fn.argtypes = []
        alloc_fn.restype = ctypes.c_void_p

        free_fn = getattr(lib, f"{kind}_free")
        free_fn.argtypes = [ctypes.c_void_p]
        free_fn.restype = None

        cls._cache[kind] = (alloc_fn, free_fn)
        return (alloc_fn, free_fn)

    def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
        self.kind = kind
        alloc_fn, self._free_fn = self._init_c_func(kind)
        self.ptr = alloc_fn() if ptr is None else ptr
        # print(self)

    def free(self) -> None:
        if self.ptr is not None:
            self._free_fn(self.ptr)
            # print(f"freeing {self}")
            self.ptr = NULL

    def __enter__(self) -> ctypes.c_void_p:
        return self.ptr

    def __exit__(self, *args: Any) -> None:
        self.free()

    def __del__(self) -> None:
        self.free()

    def __repr__(self) -> str:
        return f"<{self.kind} native object at 0x{self.ptr:x}>"
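

# Usage sketch: NativeObj looks up the matching `<kind>_alloc`/`<kind>_free`
# pair in the shared library and releases the pointer deterministically:
#
#     with NativeObj("fairseq2_model") as ptr:
#         ...  # ptr is a raw ctypes.c_void_p; freed when the block exits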


def MeasureArena() -> NativeObj:
    return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))


def FixedSizeArena(mem_size: int) -> NativeObj:
    memory = torch.zeros(mem_size, dtype=torch.uint8)
    allocr = ggml_allocr_new(
        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
    )
    arena = NativeObj("ggml_allocr", allocr)
    # Add a reference from the arena object to the underlying tensor,
    # otherwise it will be freed too early.
    setattr(arena, "__memory", memory)
    return arena


lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]


def Fairseq2Model() -> NativeObj:
    return NativeObj("fairseq2_model")


lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)


def CppStr(content: str) -> NativeObj:
    c_str = ctypes.create_string_buffer(content.encode("utf-8"))
    cpp_str = lib.std_string_alloc(c_str)
    return NativeObj("std_string", cpp_str)
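

# Usage sketch (hypothetical prefix string): wrap a Python str into a
# heap-allocated C++ std::string for native functions that expect one;
# the std::string is freed when the context exits:
#
#     with CppStr("decoder.layers.0") as std_prefix:
#         ...  # pass std_prefix to a native function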


lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.load_fairseq2_ggml_file.restype = ctypes.c_int


def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
    model = Fairseq2Model()
    bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
    if err:
        raise Exception(f"Failed to load model from {model_file} (error code {err})")
    return model
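

# Usage sketch (hypothetical file name):
#
#     model = load_fairseq2_ggml_file(Path("seamlessM4T_medium.ggml"))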


# lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
# lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)


# def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
#     return lib.unity_audio_encoder_graph(model.ptr, tensor)  # type: ignore


# lib.unity_eval.argtypes = [
#     ctypes.c_void_p,
#     ctypes.c_void_p,
#     ctypes.POINTER(ggml_tensor),
#     ctypes.c_int,
# ]
# lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)


# def unity_eval(
#     allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
# ) -> ggml_cgraph_p:
#     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)


_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}


def forward(
    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
) -> ggml_tensor_p:
    fwd: Any = _FORWARD_CACHE.get(layer_name)
    if fwd is None:
        fwd = getattr(lib, layer_name + "_forward")
        num_inputs = len(inputs)
        fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
            ctypes.POINTER(ggml_tensor)
        ] * num_inputs
        fwd.restype = ctypes.POINTER(ggml_tensor)
        _FORWARD_CACHE[layer_name] = fwd

    with CppStr(prefix) as std_prefix:
        return fwd(model, std_prefix, *inputs)  # type: ignore[no-any-return]
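

# Usage sketch (hypothetical layer and prefix names; assumes the native
# library exports a matching `<layer_name>_forward` symbol and that the
# model stores weights under `prefix`):
#
#     out = forward("StandardLayerNorm", model.ptr, "encoder.layer_norm", x)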


def build_and_compute(
    ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1
) -> None:
    gf = ggml_build_forward(tensor)
    ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads)
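

# End-to-end sketch (assumes `ctx` has enough scratch memory for the graph):
#
#     x = from_numpy(ctx, np.arange(6, dtype=np.float32).reshape(2, 3))
#     y = ggml_relu(ctx, x)  # any ggml op; the graph is built lazily
#     build_and_compute(ctx, y)
#     print(to_numpy(y))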


@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_slice(
    ctx: ggml_context_p,
    a: Ptr[ggml_tensor],
    axis: int,
    start: ctypes.c_int64,
    end: ctypes.c_int64,
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_flatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
) -> Ptr[ggml_tensor]:
    return a


@c_fn(lib)
def ggml_unflatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
) -> Ptr[ggml_tensor]:
    return a


@c_struct
@dataclasses.dataclass
class SequenceGeneratorOptions:
    beam_size: int
    min_seq_len: int = 5
    soft_max_seq_len_a: float = 1.0
    soft_max_seq_len_b: int = 200
    hard_max_seq_len: int = 1024
    len_penalty: float = 1.0
    unk_penalty: float = 0.0
    normalize_scores: bool = True


@c_struct
@dataclasses.dataclass
class SequenceGeneratorJob:
    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    pad_idx: int
    unk_idx: int
    bos_idx: int
    eos_idx: int
    num_threads: int = 1


@c_struct
class Hypothesis:
    seq: Ptr[ggml_tensor]
    """The generated sequence."""

    score: float
    """The score of the hypothesis."""

    step_scores: Ptr[ggml_tensor]
    """The score of each individual sequence step."""


@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    result_ctx: ggml_context_p,
) -> Ptr[Hypothesis]:
    ...


@c_fn(lib)
def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
    return Ptr()


@c_fn(lib)
def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
    return -1


@c_fn(lib.fairseq2_kv_cache_alloc)
def _fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> None:
    pass


@c_fn(lib.fairseq2_kv_cache_reset)
def _fairseq2_kv_cache_reset(model: ctypes.c_void_p) -> None:
    pass


@contextlib.contextmanager
def fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> Iterator[None]:
    _fairseq2_kv_cache_alloc(model, beam_size, max_seq_len)
    try:
        yield
    finally:
        _fairseq2_kv_cache_reset(model)
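

# Usage sketch: allocate the decoder KV cache for the duration of one
# generation call, then reset it (argument values are illustrative):
#
#     with fairseq2_kv_cache_alloc(model.ptr, beam_size=5, max_seq_len=128):
#         hyp_ptr = generate_sequence(model.ptr, job, enc_out, enc_mask, ctx)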