  1. """
  2. We are vendoring https://github.com/abetlen/ggml-python (MIT License)
  3. adding a few utilities to convert between ggml and numpy tensors for testing.
  4. """
  5. import numpy as np
  6. import ctypes
  7. import torch
  8. import functools
  9. import logging
  10. import dataclasses
  11. from typing import NamedTuple
  12. from pathlib import Path
  13. from typing import Dict
  14. from typing import Callable
  15. from typing import Any
  16. from typing import Tuple
  17. from typing import Union
  18. from typing import Type
  19. from third_party_ggml import *
  20. from ctypes_utils import c_struct, c_fn, Ptr


### Helpers


@functools.lru_cache(4)
def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype:
    if ggml_type == 0:
        # GGML_TYPE_F32 = 0
        return np.dtype(np.float32)
    if ggml_type == 1:
        # GGML_TYPE_F16 = 1
        return np.dtype(np.float16)
    if ggml_type == 18:
        # GGML_TYPE_I32 = 18
        return np.dtype(np.int32)
    raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")


def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
    if dtype == np.float32:
        return ctypes.c_int(0)
    elif dtype == np.int32:
        return ctypes.c_int(18)
    elif dtype == np.float16:
        return ctypes.c_int(1)
    raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
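

# Example (sketch): the mapping round-trips for the supported dtypes. Note the
# annotation says ctypes.c_int, but a tensor's `.type` field reads back as a
# plain Python int, which is what the comparisons above expect.
#
#     assert numpy_dtype(0) == np.float32
#     assert from_numpy_dtype(np.dtype(np.float16)).value == 1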


def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    """Shape of a tensor in numpy order (reverse of ggml's `ne`)."""
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    return tuple([tensor.ne[i] for i in range(ndims)[::-1]])


def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    """Stride of each dimension in bytes, in ggml order."""
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.nb[i] for i in range(4)])


def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    """Number of elements per dimension, in ggml order."""
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    return tuple([tensor.ne[i] for i in range(4)])


def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
    """Strides in bytes, in numpy order (reverse of ggml's `nb`)."""
    if isinstance(tensor, ctypes._Pointer):
        tensor = tensor.contents
    ndims = tensor.n_dims
    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
    return num_bytes[::-1]
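

# Axis-order convention: ggml's ne[0]/nb[0] describe the contiguous dimension,
# which numpy lists last. A sketch for a contiguous float32 array of numpy
# shape (2, 3):
#
#     ne(t)      == (3, 2, 1, 1)     # elements per ggml dimension
#     nb(t)      == (4, 12, 24, 24)  # bytes per ggml dimension
#     shape(t)   == (2, 3)           # numpy order
#     strides(t) == (12, 4)          # numpy order, in bytes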


def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if not ggml_is_contiguous(tensor_p):
        if not _almost_contiguous(tensor_p):
            return _strided_to_numpy(tensor_p)
    tensor = tensor_p.contents
    res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type))
    if ggml_is_transposed(tensor_p):
        # Patch up strides to work with transposed ggml_tensor
        res.strides = strides(tensor)  # type: ignore[assignment]
    return res


def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool:
    """Distinguishes between fully strided and just transposed."""
    tensor = tensor_p.contents
    num_bytes = nb(tensor)
    num_elem = ne(tensor)
    # Sort the axes according to 'num_bytes'
    nbe = sorted(zip(num_bytes, num_elem))
    itemsize = ggml_type_size(tensor.type)
    stride_exp = itemsize
    for stride, e in nbe:
        if stride != stride_exp:
            return False
        stride_exp *= e
    return True
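

# Worked example: transposing the (2, 3) float32 tensor above yields
# ne = (2, 3, 1, 1) with nb = (12, 4, 24, 24). Sorting the axes by byte-stride
# gives (4, 3), (12, 2), (24, 1), (24, 1): each stride equals the itemsize
# times the product of the preceding element counts, so the data is one dense
# block and _almost_contiguous returns True; to_numpy then only needs to patch
# the numpy strides instead of copying.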


def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
    if ggml_is_transposed(tensor_p):
        raise NotImplementedError(
            "to_numpy doesn't support tensors both transposed and strided."
        )
    tensor = tensor_p.contents
    n_dim = tensor.n_dims
    t_shape = shape(tensor)
    t_strides = strides(tensor)
    type_size = ggml_type_size(tensor.type)
    full_shape = []
    num_bytes = nb(tensor)

    # Determine the full backing slice of bytes to read.
    # TODO: make this work for transposed arrays
    n = 1
    total_elements = 1
    try:
        for d in range(n_dim - 1):
            n = num_bytes[d + 1] // type_size // n
            full_shape.append(n)
            total_elements *= n
    except ZeroDivisionError:
        logging.warning("Can't convert permuted GGML tensor back to numpy")
        return None  # type: ignore[return-value]
    # We don't need to guess for the first dimension, since this doesn't impact striding.
    full_shape.append(t_shape[0])
    total_elements *= t_shape[0]
    full_shape = full_shape[::-1]

    res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))
    # Extract the correct slice
    res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
    # TODO: we could handle transposition here
    return res


def _void_p_to_np_array(
    data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype
) -> np.ndarray:
    # Convert the ggml data pointer to a pointer of bytes.
    # This is needed because Python ctypes doesn't have "float16",
    # and `as_array` only works with ctypes types.
    int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}")
    ptr = ctypes.cast(data, ctypes.POINTER(int_width))
    # Create a numpy array with the wrong dtype
    int_arr = np.ctypeslib.as_array(ptr, shape=shape)
    # Reinterpret it to the right dtype
    return np.frombuffer(int_arr, dtype=dtype).reshape(shape)


GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS
GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS


def from_file(
    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
) -> ggml_tensor_p:
    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
    return from_numpy(ctx, data)


def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
    # In GGML, ne[0] indicates the contiguous dimension, i.e. the last one in numpy and torch.
    ne = shape[::-1]
    if len(ne) >= GGML_MAX_DIMS:
        return ne  # type: ignore
    # Pad with ones so ne always has GGML_MAX_DIMS entries.
    padding = (1,) * (GGML_MAX_DIMS - len(ne))
    return ne + padding  # type: ignore


def _compute_nbytes(
    ne: Tuple[int, int, int, int], type: ctypes.c_int
) -> Tuple[int, int, int, int]:
    nb0 = ggml_type_size(type)
    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
    nb2 = nb1 * ne[1]
    nb3 = nb2 * ne[2]
    return (nb0, nb1, nb2, nb3)
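

# Worked example: for a float32 tensor (block size 1) with numpy shape (2, 3),
# _shape_to_ne gives ne = (3, 2, 1, 1) and _compute_nbytes gives
# nb = (4, 4 * 3, 12 * 2, 24 * 1) = (4, 12, 24, 24): each stride is the
# previous stride times the previous dimension's element count.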


def from_numpy(
    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"], name: bytes = b""
) -> Ptr[ggml_tensor]:
    if type(array).__name__ == "Tensor":
        array = array.numpy()
    # Create an empty tensor so we don't allocate memory for the data pointer
    gtype = from_numpy_dtype(array.dtype)
    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
    # Fill out the correct dimensions and shape.
    tensor_p.contents.n_dims = array.ndim
    ne = _shape_to_ne(array.shape)
    tensor_p.contents.ne = GgmlNElem(*ne)
    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype))
    # Point the tensor data to the content of the numpy array.
    tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
    # Prevent the underlying numpy array from being freed while the tensor is alive.
    setattr(tensor_p, "__data", array)
    if name:
        ggml_set_name(tensor_p, name)
    return tensor_p  # type: ignore
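

# Example (sketch): round-tripping a numpy array through a ggml tensor.
# Assumes a context created with ggml_init from third_party_ggml; the tensor
# aliases the numpy buffer, so no data is copied.
#
#     ctx = ggml_init(ggml_init_params(mem_size=1024 * 1024, mem_buffer=None))
#     a = np.arange(6, dtype=np.float32).reshape(2, 3)
#     t = from_numpy(ctx, a, name=b"a")
#     assert np.allclose(to_numpy(t), a)
#     ggml_free(ctx)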


def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool:
    assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"
    return (
        (t0.contents.ne[0] == t1.contents.ne[0])
        and (t1.contents.ne[2] % t0.contents.ne[2] == 0)
        and (t1.contents.ne[3] % t0.contents.ne[3] == 0)
    )


def nodes(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_nodes):
        name = gf.nodes[i].contents.name
        res[name] = gf.nodes[i]
    return res


def leafs(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]:
    res = {}
    for i in range(gf.n_leafs):
        name = gf.leafs[i].contents.name
        res[name] = gf.leafs[i]
    return res


class NativeObj:
    """Wraps a native pointer with paired `{kind}_alloc`/`{kind}_free` functions from `lib`."""

    AllocFn = Callable[[], ctypes.c_void_p]
    FreeFn = Callable[[ctypes.c_void_p], None]
    _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {}

    @classmethod
    def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]:
        if kind in cls._cache:
            return cls._cache[kind]

        alloc_fn = getattr(lib, f"{kind}_alloc")
        alloc_fn.argtypes = []
        alloc_fn.restype = ctypes.c_void_p

        free_fn = getattr(lib, f"{kind}_free")
        free_fn.argtypes = [ctypes.c_void_p]
        free_fn.restype = None

        cls._cache[kind] = (alloc_fn, free_fn)
        return (alloc_fn, free_fn)

    def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL):
        self.kind = kind
        alloc_fn, self._free_fn = self._init_c_func(kind)
        self.ptr = alloc_fn() if ptr is None else ptr

    def free(self) -> None:
        if self.ptr is not None:
            self._free_fn(self.ptr)
            self.ptr = NULL

    def __enter__(self) -> ctypes.c_void_p:
        return self.ptr

    def __exit__(self, *args: Any) -> None:
        self.free()

    def __del__(self) -> None:
        self.free()

    def __repr__(self) -> str:
        return f"<{self.kind} native object at 0x{self.ptr:x}>"


def MeasureArena() -> NativeObj:
    return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN))


def FixedSizeArena(mem_size: int) -> NativeObj:
    memory = torch.zeros(mem_size, dtype=torch.uint8)
    allocr = ggml_allocr_new(
        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
    )
    arena = NativeObj("ggml_allocr", allocr)
    # Add a reference from the arena object to the underlying tensor, otherwise it will be freed too early.
    setattr(arena, "__memory", memory)
    return arena


lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]


def Fairseq2Model() -> NativeObj:
    return NativeObj("fairseq2_model")


lib.std_string_alloc.argtypes = [ctypes.c_char_p]
lib.std_string_alloc.restype = ctypes.c_void_p
lib.std_string_free.argtypes = [ctypes.c_void_p]
lib.std_string_free.restype = None
NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)


def CppStr(content: str) -> NativeObj:
    c_str = ctypes.create_string_buffer(content.encode("utf-8"))
    cpp_str = lib.std_string_alloc(c_str)
    return NativeObj("std_string", cpp_str)
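

# Example: NativeObj is a context manager, so the native allocation is freed
# on scope exit. CppStr wraps a Python string as a C++ std::string pointer,
# which is exactly how forward() below passes layer prefixes:
#
#     with CppStr("speech_encoder.") as std_prefix:
#         ...  # pass std_prefix to a native function expecting a std::string*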


lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.load_fairseq2_ggml_file.restype = ctypes.c_int


def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
    model = Fairseq2Model()
    bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
    if err:
        raise Exception(f"Failed to load model: {model_file} (err={err})")
    return model


# lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
# lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph)
#
#
# def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
#     return lib.unity_audio_encoder_graph(model.ptr, tensor)  # type: ignore
#
#
# lib.unity_eval.argtypes = [
#     ctypes.c_void_p,
#     ctypes.c_void_p,
#     ctypes.POINTER(ggml_tensor),
#     ctypes.c_int,
# ]
# lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)
#
#
# def unity_eval(
#     allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
# ) -> ggml_cgraph_p:
#     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)


_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}


def forward(
    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
) -> ggml_tensor_p:
    fwd: Any = _FORWARD_CACHE.get(layer_name)
    if fwd is None:
        fwd = getattr(lib, layer_name + "_forward")
        num_inputs = len(inputs)
        fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [
            ctypes.POINTER(ggml_tensor)
        ] * num_inputs
        fwd.restype = ctypes.POINTER(ggml_tensor)
        _FORWARD_CACHE[layer_name] = fwd

    with CppStr(prefix) as std_prefix:
        return fwd(model, std_prefix, *inputs)  # type: ignore[no-any-return]
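

# Example (sketch): dispatching to a native `{layer_name}_forward` symbol.
# The layer name "StandardLinear" and prefix "encoder.proj" are hypothetical
# placeholders; the real names are defined by the C++ side of the library.
#
#     out = forward("StandardLinear", model.ptr, "encoder.proj", x)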


def build_and_compute(
    ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1
) -> None:
    gf = ggml_build_forward(tensor)
    ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads)
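

# Example (sketch): build and run the graph rooted at a result tensor, then
# look up tensors by name. Assumes `a` and `b` are compatible tensors created
# in `ctx` (e.g. via from_numpy) and named with ggml_set_name:
#
#     c = ggml_mul_mat(ctx, a, b)
#     gf = ggml_build_forward(c)
#     ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 4)
#     named = nodes(gf)  # Dict[bytes, ggml_tensor_p]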


# The bodies of @c_fn functions below are placeholders for type checkers;
# calls are forwarded to the native symbol of the same name in `lib`.
@c_fn(lib)
def causal_attention_mask(
    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_slice(
    ctx: ggml_context_p,
    a: Ptr[ggml_tensor],
    axis: int,
    start: ctypes.c_int64,
    end: ctypes.c_int64,
) -> Ptr[ggml_tensor]:
    ...


@c_fn(lib)
def ggml_flatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
) -> Ptr[ggml_tensor]:
    return a


@c_fn(lib)
def ggml_unflatten_1d(
    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
) -> Ptr[ggml_tensor]:
    return a


@c_struct
@dataclasses.dataclass
class SequenceGeneratorOptions:
    beam_size: int
    min_seq_len: int = 5
    soft_max_seq_len_a: float = 1.0
    soft_max_seq_len_b: int = 200
    hard_max_seq_len: int = 1024
    len_penalty: float = 1.0
    unk_penalty: float = 0.0
    normalize_scores: bool = True


@c_struct
@dataclasses.dataclass
class SequenceGeneratorJob:
    opts: SequenceGeneratorOptions
    prefix_seq: Ptr[ggml_tensor]
    pad_idx: int
    unk_idx: int
    bos_idx: int
    eos_idx: int
    num_threads: int = 1


@c_struct
class Hypothesis:
    seq: Ptr[ggml_tensor]
    """The generated sequence."""

    score: float
    """The score of the hypothesis."""

    step_scores: Ptr[ggml_tensor]
    """The score of each individual sequence step."""


@c_fn(lib)
def generate_sequence(
    model: ctypes.c_void_p,
    job: Ptr[SequenceGeneratorJob],
    encoder_output: Ptr[ggml_tensor],
    encoder_padding_mask: Ptr[ggml_tensor],
    result_ctx: ggml_context_p,
) -> Ptr[Hypothesis]:
    ...
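

# Example (sketch): running generation over a pre-computed encoder output.
# All option and index values below are illustrative placeholders, and the
# byref() call assumes the binding accepts a standard ctypes pointer.
#
#     job = SequenceGeneratorJob(
#         opts=SequenceGeneratorOptions(beam_size=5),
#         prefix_seq=prefix,  # e.g. an int32 tensor holding [bos_idx]
#         pad_idx=0, unk_idx=1, bos_idx=2, eos_idx=3,
#     )
#     hyp_ptr = generate_sequence(
#         model.ptr, ctypes.byref(job), enc_out, enc_mask, result_ctx
#     )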


@c_fn(lib)
def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
    return Ptr()


@c_fn(lib)
def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
    return -1


@c_fn(lib)
def fairseq2_kv_cache_alloc(
    model: ctypes.c_void_p, beam_size: int, max_seq_len: int
) -> None:
    pass