convert-hf : reduce repeated boilerplate from write_tensors #7031

Closed
wants to merge 19 commits into master from compilade/convert-hf-refactor
Changes from 11 of 19 commits

Commits:
47e02eb convert-hf : begin refactoring write_tensor (compilade, Apr 30, 2024)
0d720ac Merge branch 'master' into compilade/convert-hf-refactor (compilade, Apr 30, 2024)
c33775b convert : upgrade to sentencepiece v0.2.0 (compilade, Apr 30, 2024)
698f0b3 convert-hf : remove unused n_dims in extra_*_tensors (compilade, Apr 30, 2024)
cde9ea6 convert-hf : simplify MoE weights stacking (compilade, Apr 30, 2024)
56f60f5 convert-hf : flake8 linter doesn't like semicolons (compilade, May 1, 2024)
3870164 convert-hf : allow unusual model part names (compilade, May 1, 2024)
dcd8dfa convert : use a string for the SentencePiece tokenizer path (compilade, May 1, 2024)
21068b6 convert-hf : display tensor shape (compilade, May 1, 2024)
639b374 convert-hf : convert norms to f32 by default (compilade, May 1, 2024)
644c269 convert-hf : sort model part names (compilade, May 1, 2024)
ce067af convert-hf : use an ABC for Model again (compilade, May 2, 2024)
13f4cf7 convert-hf : use a plain class for Model, and forbid direct instantiation (compilade, May 2, 2024)
6a54973 Merge branch 'master' into compilade/convert-hf-refactor (compilade, May 3, 2024)
3e5e0dc Merge branch 'master' into compilade/convert-hf-refactor (compilade, May 3, 2024)
98f2d0e convert-hf : more consistent formatting of cmdline args (compilade, May 4, 2024)
f2099c5 convert-hf : align the message logged for converted tensors (compilade, May 4, 2024)
215a0d3 convert-hf : fix Refact conversion (compilade, May 5, 2024)
c32d39c Merge branch 'master' into compilade/convert-hf-refactor (mofosyne, May 6, 2024)
1 change: 1 addition & 0 deletions .devops/nix/package.nix

@@ -86,6 +86,7 @@ let
       # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
       llama-python-extra = python3.withPackages (
         ps: [
+          ps.einops
           ps.numpy
           ps.sentencepiece
           ps.tiktoken
1,823 changes: 564 additions & 1,259 deletions convert-hf-to-gguf.py

Large diffs are not rendered by default.

20 changes: 12 additions & 8 deletions convert.py

@@ -281,6 +281,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         n_experts = None
         n_experts_used = None
         f_rope_freq_base = None
+        n_ff = None
 
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -305,6 +306,8 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6
 
+        assert n_ff is not None
+
         return Params(
             n_vocab = model["tok_embeddings.weight"].shape[0],
             n_embd  = config["dim"],
@@ -459,7 +462,8 @@ def __init__(self, base_path: Path):
             # not found in alternate location either
             raise FileNotFoundError('Cannot find tokenizer.model')
 
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
         vocab_size = self.sentencepiece_tokenizer.vocab_size()
 
         new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -479,23 +483,23 @@ def __init__(self, base_path: Path):
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+            piece = tokenizer.IdToPiece(i)
             text = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.GetScore(i)
 
             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.IsUnknown(i):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.IsControl(i):
                 toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
-            if tokenizer.is_unused(i):
+            if tokenizer.IsUnused(i):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.IsByte(i):
                 toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
@@ -904,7 +908,7 @@ def load() -> UnquantizedTensor:
 def rebuild_from_type_v2(func, new_type, args, state):
     return func(*args)
 
-CLASSES = {
+CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
     # getattr used here as a workaround for mypy not being smart enough to determine
     # the staticmethods have a __func__ attribute.
     ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
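
Most of the convert.py hunk above tracks the sentencepiece 0.2.0 API: model loading moves out of the constructor into LoadFromFile, and the snake_case accessors (id_to_piece, get_score, is_unknown, ...) become their CamelCase counterparts. A minimal standalone sketch of the calls used in this diff, assuming a local tokenizer.model file:

    # Sketch of the sentencepiece 0.2.0 calls appearing in this diff;
    # "tokenizer.model" is a placeholder path.
    from sentencepiece import SentencePieceProcessor

    sp = SentencePieceProcessor()
    sp.LoadFromFile("tokenizer.model")

    for i in range(sp.vocab_size()):
        piece = sp.IdToPiece(i)  # token text
        score = sp.GetScore(i)   # log-probability-like score
        special = sp.IsUnknown(i) or sp.IsControl(i) or sp.IsUnused(i) or sp.IsByte(i)
        print(i, piece, score, special)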
2 changes: 1 addition & 1 deletion examples/server/tests/features/steps/steps.py

@@ -882,7 +882,7 @@ async def oai_chat_completions(user_prompt,
         while event_received:
             event_received = False
             async for line_in_bytes in response.content:
-                line = line_in_bytes.decode('utf8')
+                line = line_in_bytes.decode('utf-8')
                 line = line.rstrip('\n').rstrip('\r')
                 if line == '':
                     continue
2 changes: 1 addition & 1 deletion gguf-py/gguf/constants.py

@@ -861,7 +861,7 @@ def get_type(val: Any) -> GGUFValueType:
 # Note: Does not support GGML_QKK_64
 QK_K = 256
 # Items here are (block size, type size)
-GGML_QUANT_SIZES = {
+GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32: (1, 4),
     GGMLQuantizationType.F16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
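
The new annotation makes the table's value shape explicit: each quantization type maps to (block size in elements, type size in bytes), and a tensor's byte count follows as n_bytes = n_elems * type_size // block_size, the formula used by the reader changes below. A small illustrative check with the Q4_0 entry from this table (the tensor dimensions are a made-up example):

    # Q4_0 packs 32 weights into 2 + 16 = 18 bytes per block.
    block_size, type_size = 32, 2 + 16  # GGML_QUANT_SIZES[GGMLQuantizationType.Q4_0]

    n_elems = 4096 * 4096                        # hypothetical tensor
    n_bytes = n_elems * type_size // block_size
    print(n_bytes)                               # 9437184 bytes, vs. 67108864 at f32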
8 changes: 4 additions & 4 deletions gguf-py/gguf/gguf_reader.py

@@ -63,7 +63,7 @@ class ReaderTensor(NamedTuple):
 
 class GGUFReader:
     # I - same as host, S - swapped
-    byte_order: Literal['I' | 'S'] = 'I'
+    byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
 
     # Note: Internal helper, API may change.
@@ -81,7 +81,7 @@ class GGUFReader:
         GGUFValueType.BOOL: np.bool_,
     }
 
-    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
+    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
@@ -126,7 +126,7 @@ def get_tensor(self, idx: int) -> ReaderTensor:
         return self.tensors[idx]
 
     def _get(
-        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
+        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None,
     ) -> npt.NDArray[Any]:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
@@ -248,7 +248,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
             raise ValueError(f'Found duplicated tensor with name {tensor_name}')
         tensor_names.add(tensor_name)
         ggml_type = GGMLQuantizationType(raw_dtype[0])
-        n_elems = np.prod(dims)
+        n_elems = int(np.prod(dims))
         block_size, type_size = GGML_QUANT_SIZES[ggml_type]
         n_bytes = n_elems * type_size // block_size
         data_offs = int(start_offs + offset_tensor[0])
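
The Literal changes above fix an invalid annotation form: in Literal['I' | 'S'], the | is applied to two plain strings before Literal ever sees them, which type checkers reject (and which would raise TypeError if the annotation were evaluated eagerly). A short sketch of the valid spellings, assuming Python 3.10+ for | between types:

    from typing import Literal

    ByteOrder = Literal['I'] | Literal['S']  # union of single-value Literals, as in this PR
    ByteOrderAlt = Literal['I', 'S']         # multi-value Literal; same meaning

    def set_byte_order(order: ByteOrder) -> None:
        print(order)

    set_byte_order('I')  # OK
    set_byte_order('X')  # accepted at runtime, but flagged by pyright/mypy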
6 changes: 3 additions & 3 deletions gguf-py/gguf/gguf_writer.py

@@ -173,7 +173,7 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
         if pack_fmt is not None:
             self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
         elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
             self.kv_data += self._pack("Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
@@ -202,7 +202,7 @@ def add_tensor_info(
             raise ValueError(f'Duplicated tensor name {name}')
         self.ti_names.add(name)
 
-        encoded_name = name.encode("utf8")
+        encoded_name = name.encode("utf-8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
@@ -476,7 +476,7 @@ def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
-        if isinstance(value, list):
+        if not isinstance(value, str):
            template_default = None
            template_names = set()
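
The add_chat_template change widens the multi-template branch from lists to any non-string value, so e.g. a tuple of named templates also takes that path. A hedged usage sketch: GGUFWriter and add_chat_template are real, but the file name, architecture string, and template contents are placeholders, and the name/template keys follow the Hugging Face tokenizer_config convention this code appears to consume:

    import gguf

    writer = gguf.GGUFWriter("model.gguf", "llama")  # placeholder path and arch

    # Any non-str sequence of named templates now works, not just a list:
    writer.add_chat_template((
        {"name": "default", "template": "{% for m in messages %}{{ m.content }}{% endfor %}"},
        {"name": "tool_use", "template": "{% ... %}"},
    ))

A plain string template still takes the other branch and is stored as a single value.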
6 changes: 3 additions & 3 deletions gguf-py/gguf/vocab.py

@@ -4,7 +4,7 @@
 import os
 import sys
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Callable, Sequence, Mapping, Iterable
 
 from .gguf_writer import GGUFWriter
 
@@ -13,11 +13,11 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
-    chat_template: str | None
+    chat_template: str | Sequence[Mapping[str, str]] | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
-        special_token_types: tuple[str, ...] | None = None,
+        special_token_types: Iterable[str] | None = None,
         n_vocab: int | None = None,
     ):
         self.special_token_ids = {}
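
The widened typing matches how callers pass these values: special_token_types only needs to be iterated, and chat_template may now hold the named-template list form. A small sketch based on the signature in this diff; the model path and token-type names are placeholder assumptions:

    from gguf.vocab import SpecialVocab

    sv = SpecialVocab(
        "path/to/model-dir",                        # hypothetical model directory
        load_merges=True,
        special_token_types=["bos", "eos", "unk"],  # a list is fine now, not only a tuple
        n_vocab=32000,
    )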
2 changes: 1 addition & 1 deletion gguf-py/scripts/gguf-dump.py

@@ -43,7 +43,7 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
         if len(field.types) == 1:
             curr_type = field.types[0]
             if curr_type == GGUFValueType.STRING:
-                print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
+                print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])), end = '')
             elif field.types[0] in reader.gguf_scalar_to_np:
                 print(' = {0}'.format(field.parts[-1][0]), end = '')
         print()
12 changes: 6 additions & 6 deletions gguf-py/scripts/gguf-new-metadata.py

@@ -7,7 +7,7 @@
 from pathlib import Path
 
 import numpy as np
-from typing import Any, Mapping, Sequence
+from typing import Any, Sequence
 
 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@@ -34,19 +34,19 @@ def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
     return host_endian
 
 
-def decode_field(field: gguf.ReaderField) -> Any:
+def decode_field(field: gguf.ReaderField | None) -> Any:
     if field and field.types:
         main_type = field.types[0]
 
         if main_type == gguf.GGUFValueType.ARRAY:
             sub_type = field.types[-1]
 
             if sub_type == gguf.GGUFValueType.STRING:
-                return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
+                return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
             else:
                 return [pv for idx in field.data for pv in field.parts[idx].tolist()]
         if main_type == gguf.GGUFValueType.STRING:
-            return str(bytes(field.parts[-1]), encoding='utf8')
+            return str(bytes(field.parts[-1]), encoding='utf-8')
         else:
             return field.parts[-1][0]
 
@@ -59,7 +59,7 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
     return decode_field(field)
 
 
-def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], remove_metadata: Sequence[str]) -> None:
+def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, str], remove_metadata: Sequence[str]) -> None:
     for field in reader.fields.values():
         # Suppress virtual fields and fields written by GGUFWriter
         if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@@ -101,7 +101,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
 
     for tensor in reader.tensors:
         # Dimensions are written in reverse order, so flip them first
-        shape = np.flipud(tensor.shape)
+        shape = np.flipud(tensor.shape).tolist()
Review thread on the line above (shape = np.flipud(tensor.shape).tolist()):

Contributor: Any particular reason for coercing shape to list here (and not elsewhere)?

compilade (Collaborator, author) on May 2, 2024: It's passed to GGUFWriter.add_tensor_info, which expects a Sequence[int] for the shape, and this shape is of type NDArray[uint32], which caused the error:

    Argument of type "NDArray[uint32]" cannot be assigned to parameter "tensor_shape" of type "Sequence[int]" in function "add_tensor_info"
        "NDArray[uint32]" is incompatible with "Sequence[int]" (reportGeneralTypeIssues)

This comes from the shape field of a ReaderTensor, and it is coerced to list in other places, like in gguf-dump.py:

    "shape": tensor.shape.tolist(),

    prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))

But the shape field of ReaderTensor is only used in 7 places (including its definition). In the other places, the shape usually comes directly from either NumPy or PyTorch, which use tuple[int, ...] for the shape type, which is compatible with Sequence[int].

Contributor: I see; although GGUFWriter.add_tensor_info's typing is then perhaps not correct, I understand why it's simpler to just make it a list.

compilade (Collaborator, author): GGUFWriter.add_tensor_info's typing seems correct to me; it's used in 2 other places, and both use shapes which are already compatible with Sequence[int]:

    shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
    self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)

    self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)

So using Sequence[int] there seems appropriate, as it's the most general type (I think?) which can be indexed; it avoids having to cast tuple[int, ...] into list[int], or list[int] into tuple[int, ...]. This is how the shape is used in add_tensor_info:

    for i in range(n_dims):
        self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])

CISC (Contributor) on May 2, 2024: Sure, all I'm saying is that that also works with NDArray[uint32] (even though it's not compatible with Sequence[int]).
         writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
 
     writer.write_header_to_file()
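
The thread above reproduces in isolation; a minimal standalone sketch (not from the PR) of why the .tolist() coercion satisfies the type checker where the raw NumPy array does not:

    # add_tensor_info here is a stand-in with the Sequence[int] parameter
    # type from GGUFWriter; the tensor name and dims are placeholders.
    import numpy as np
    from typing import Sequence

    def add_tensor_info(name: str, tensor_shape: Sequence[int]) -> None:
        n_dims = len(tensor_shape)
        for i in range(n_dims):
            print(name, tensor_shape[n_dims - 1 - i])  # dims packed in reverse order

    shape = np.flipud(np.array([32000, 4096], dtype=np.uint32))
    add_tensor_info("token_embd.weight", shape)           # runs, but pyright flags it:
                                                          # NDArray[uint32] is not Sequence[int]
    add_tensor_info("token_embd.weight", shape.tolist())  # list[int] satisfies Sequence[int]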
3 changes: 3 additions & 0 deletions pyrightconfig.json

@@ -0,0 +1,3 @@
+{
+    "extraPaths": ["gguf-py"],
+}
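
A note on this new config: extraPaths adds gguf-py to pyright's import search path, presumably so that import gguf in the conversion scripts type-checks against the in-tree gguf-py package rather than requiring an installed copy.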
2 changes: 1 addition & 1 deletion requirements/requirements-convert.txt

@@ -1,5 +1,5 @@
 numpy~=1.24.4
-sentencepiece~=0.1.98
+sentencepiece~=0.2.0
 transformers>=4.35.2,<5.0.0
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0