@@ -11,7 +11,7 @@ import struct
 from enum import Enum
 from io import BufferedWriter
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set, final
 
 import torch
 from fairseq2.assets import AssetCard
@@ -19,10 +19,147 @@ from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend
 from fairseq2.nn import SinusoidalPositionEncoder
 from fairseq2.nn.transformer import RelativePositionalEncoding
 from seamless_communication.models import unity
+from fairseq2.data.text import SentencePieceTokenizerBase
+from fairseq2.data.typing import PathLike
+from typing import Sequence
+from fairseq2.data.text import SentencePieceEncoder, SentencePieceTokenizerBase
+from fairseq2.typing import Device, finaloverride
+from fairseq2.models.utils import TokenizerLoaderBase
+from fairseq2.assets import asset_store, download_manager
+from seamless_communication.models.unity.builder import UnitYConfig, create_unity_model
+from fairseq2.models.utils import ModelLoader
+from seamless_communication.models.unity.model import UnitYModel
 
 import ggml
 
 Preprocessor = Callable[[Any], Any]
 
+SMALLER_MODELS = [
+    "unity_nano",
+    "unity_micro",
+]  # Trained with fairseq2, with custom dict (not original NLLB ones)
+
+
+@final
+class NllbLikeTokenizer(SentencePieceTokenizerBase):
+    """The only difference between this class and NllbTokenizer is it doesn't add a <pad> to control symbol list.
+    Since NllbTokenizer is defined as final, we couldn't inherit from it directly. So copying ~everything"""
+
+    langs: Set[str]
+    default_lang: str
+
+    def __init__(
+        self, pathname: PathLike, langs: Sequence[str], default_lang: str
+    ) -> None:
+        """
+        :param pathname:
+            The pathname of the SentencePiece model file.
+        :param langs:
+            The list of supported languages.
+        :param default_lang:
+            The fall-back language if no language is specified.
+        """
+        # Each language is represented by a `__lang__` control symbol.
+        control_symbols = [f"__{lang}__" for lang in langs]
+
+        # Internal control symbols that are not relevant for eval use.
+        control_symbols.extend(["<MINED_DATA>", "<MMT_BT_DATA>", "<SMT_BT_DATA>"])
+        super().__init__(pathname, control_symbols)
+
+        self.langs = set(langs)
+
+        self.default_lang = default_lang
+
+    @finaloverride
+    def create_encoder(
+        self,
+        *,
+        task: Optional[str] = None,
+        lang: Optional[str] = None,
+        mode: Optional[str] = None,
+        device: Optional[Device] = None,
+        pin_memory: bool = False,
+    ) -> SentencePieceEncoder:
+        """Create a token encoder.
+
+        :param task:
+            Must be 'translation'. If ``None``, defaults to 'translation'.
+        :param lang:
+            A language from :attr:`langs`. If ``None``, defaults to
+            :attr:`default_lang`.
+        :param mode:
+            Must be 'source' or 'target'. Set to 'source' if ``lang`` is the
+            source language; set to 'target' if ``lang`` is the target language.
+            If ``None``, defaults to 'source'.
+        :param device:
+            The device on which to construct tensors.
+        :param pin_memory:
+            If ``True``, uses pinned memory while constructing tensors.
+        """
+        if task is not None and task != "translation":
+            raise ValueError(f"`task` must be 'translation', but is '{task}' instead.")
+
+        if lang is None:
+            lang = self.default_lang
+
+        if lang not in self.langs:
+            raise ValueError(
+                f"`lang` must be a supported language, but is '{lang}' instead."
+            )
+
+        if mode is None or mode == "source":
+            # NLLB models expect a language token in place of BOS in source
+            # sequences.
+            prefix_tokens = [f"__{lang}__"]
+            suffix_tokens = ["</s>"]
+        elif mode == "source_mining":
+            prefix_tokens = [f"__{lang}__", "<MINED_DATA>"]
+            suffix_tokens = ["</s>"]
+        elif mode == "source_mmt_bt":
+            prefix_tokens = [f"__{lang}__", "<MMT_BT_DATA>"]
+            suffix_tokens = ["</s>"]
+        elif mode == "source_smt_bt":
+            prefix_tokens = [f"__{lang}__", "<SMT_BT_DATA>"]
+            suffix_tokens = ["</s>"]
+        elif mode == "target":
+            # Target sequences are expected to start with an EOS, followed by
+            # the language token.
+            prefix_tokens = ["</s>", f"__{lang}__"]
+            suffix_tokens = []
+        else:
+            raise ValueError(
+                f"`mode` must be 'source' or 'target', but is '{mode}' instead."
+            )
+
+        return SentencePieceEncoder(
+            self.model,
+            prefix_tokens=prefix_tokens,
+            suffix_tokens=suffix_tokens,
+            device=device,
+            pin_memory=pin_memory,
+        )
+
+
+load_unity_model_without_conversion = ModelLoader[UnitYModel, UnitYConfig](
+    asset_store,
+    download_manager,
+    unity.load_unity_config,
+    create_unity_model,
+    None,
+    restrict_checkpoints=False,
+)
+
+
+@final
+class NllbLikeTokenizerLoader(TokenizerLoaderBase[NllbLikeTokenizer]):
+    """Loads tokenizers used by NLLB models."""
+
+    @finaloverride
+    def _load(self, pathname: Path, card: AssetCard) -> NllbLikeTokenizer:
+        langs = card.field("langs").as_list(str)
+
+        default_lang = card.field("default_lang").as_(str)
+
+        return NllbLikeTokenizer(pathname, langs, default_lang)
 
 def convert_model(
@@ -44,9 +181,20 @@ def convert_model(
                     dataclasses.asdict(model_config), separator="__"
                 )
                 print(hparams)
-            model = unity.load_unity_model(model_name)
+            # Need the diverge here because current default in SC is to convert from fairseq1 ckpt format
+            if model_name in SMALLER_MODELS:
+                model = load_unity_model_without_conversion(model_name)
+            else:
+                model = unity.load_unity_model(model_name)
             if vocab is None:
-                tokenizer = unity.load_unity_text_tokenizer(model_name)
+                # Need the diverge here because current default in SC is to add a separate <pad>
+                # as control symbol in NllbTokenizer
+                if model_name in SMALLER_MODELS:
+                    tokenizer = NllbLikeTokenizerLoader(asset_store, download_manager)(
+                        model_name
+                    )
+                else:
+                    tokenizer = unity.load_unity_text_tokenizer(model_name)
                 vocab = read_vocab(tokenizer)
         else:
            raise ValueError(f"Unsupported model type: {model_name}")
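
For reference, a minimal usage sketch of the new code path introduced by this patch (illustrative only, not part of the diff): it assumes an asset card for one of the smaller fairseq2-trained checkpoints (e.g. "unity_micro") is registered in `asset_store`, and that "eng" appears in that card's `langs` field.

    # Hypothetical example: load a SMALLER_MODELS checkpoint without fairseq1
    # conversion and build a source-mode encoder from its custom SentencePiece dict.
    model = load_unity_model_without_conversion("unity_micro")
    tokenizer = NllbLikeTokenizerLoader(asset_store, download_manager)("unity_micro")
    token_encoder = tokenizer.create_encoder(task="translation", lang="eng", mode="source")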