Browse Source

[unity.cpp] Unify main branch & WA branch; fix n_threads (#240)

* Fix conversion to work with unity_micro

* Enable ggml_convert to work with micro/nano; fix n_threads

* Remove unnecessary changes

* Use a custom load_unity_model

* Support all 4 models
Ning 1 year ago
parent
commit
4e7d540cf2
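
The commit has two strands: the C++ side threads an n_threads argument from unity.cpp's entry point down through generate_sequence and _bootstrap_seqs_and_scores (previously hard-coded to 1 thread), and the Python side teaches ggml_convert.py and the unity builder to handle the fairseq2-trained micro/nano checkpoints. As a hedged sketch of the conversion entry point (convert_model is defined in ggml_convert.py below; the output-path argument is an assumption, its full signature is not shown in this diff):

    from pathlib import Path
    from ggml_convert import convert_model

    # Assumed call shape: model name plus an output path for the GGML file.
    convert_model("unity_micro", Path("unity_micro.ggml"))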

+ 9 - 7
ggml/examples/unity/fairseq2.cpp

@@ -1143,7 +1143,8 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* full_seqs,
     ggml_tensor* scores,
     ggml_tensor* encoder_output,
-    ggml_tensor* encoder_padding_mask
+    ggml_tensor* encoder_padding_mask,
+    int n_threads
 ) {
     int prefix_seq_len = job.prefix_seq->ne[0];
     int max_seq_len = scores->ne[0];
@@ -1184,7 +1185,7 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* lprobs = ggml_log_softmax(ctx, ggml_slice(ctx, logits, 1, 0, 1));
 
     ggml_cgraph gf = ggml_build_forward(lprobs);
-    ggml_graph_compute_with_ctx(ctx, &gf, 1);
+    ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
     ggml_free(ctx);
     full_seqs->type = GGML_TYPE_I32;
     job.prefix_seq->type = GGML_TYPE_I32;
@@ -1324,7 +1325,8 @@ extern "C" Hypothesis* generate_sequence(
     const SequenceGeneratorJob& job,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_context* result_ctx
+    ggml_context* result_ctx,
+    int n_threads
 ) {
     std::vector<uint8_t> local_bufs[3] = {
         std::vector<uint8_t>(1024 * 1024 * 1024),  // step_ctx
@@ -1361,7 +1363,7 @@ extern "C" Hypothesis* generate_sequence(
     ggml_set_f32(scores, 0.0);
 
     _bootstrap_seqs_and_scores(
-        model, job, seqs, scores, encoder_output, encoder_padding_mask
+        model, job, seqs, scores, encoder_output, encoder_padding_mask, n_threads
     );
     int prefix_seq_len = job.prefix_seq->ne[0];
     int start_step = prefix_seq_len - 1;
@@ -1403,7 +1405,7 @@ extern "C" Hypothesis* generate_sequence(
         // TODO: use ggml properly compute the tweaks
         ggml_cgraph gf = ggml_build_forward(lprobs);
         // printf("beam search step %d. Graph.n_nodes: %d\n", step_nr, gf.n_nodes);
-        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf, n_threads);
         ggml_detach(lprobs);
 
         _tweak_lprobs(job, lprobs, step_nr, max_seq_len, vocab_size);
@@ -1425,7 +1427,7 @@ extern "C" Hypothesis* generate_sequence(
         }
 
         gf = ggml_build_forward(lprobs);
-        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf, n_threads);
 
         // Determine (beam, token) candidates for the next step.
         // (N, 2 x B)
@@ -1470,7 +1472,7 @@ extern "C" Hypothesis* generate_sequence(
             ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
             ggml_build_forward_expand(&gf_reorder, new_scores);
             reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
-            ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, 1);
+            ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, n_threads);
             ggml_detach(new_seqs);
             ggml_detach(new_scores);
             new_seqs->type = GGML_TYPE_I32;
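
Every ggml_graph_compute_with_ctx call in the decoding loop now honors the caller-supplied thread count instead of the hard-coded 1, so each beam-search graph evaluation can parallelize. Since these functions are extern "C", here is a hedged ctypes sketch of the updated generate_sequence signature (the shared-library name and opaque pointer types are illustrative assumptions; the repo's own Python wrapper manages the real bindings):

    import ctypes

    lib = ctypes.CDLL("libfairseq2_cpp.so")  # hypothetical library name
    # model, job, encoder_output, encoder_padding_mask, result_ctx, n_threads
    lib.generate_sequence.argtypes = [ctypes.c_void_p] * 5 + [ctypes.c_int]
    lib.generate_sequence.restype = ctypes.c_void_p  # Hypothesis*
    # hyp = lib.generate_sequence(model, job, enc_out, None, result_ctx, 8)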

+ 2 - 1
ggml/examples/unity/fairseq2.h

@@ -297,7 +297,8 @@ extern "C" Hypothesis* generate_sequence(
     const SequenceGeneratorJob& opts,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_context* result_ctx
+    ggml_context* result_ctx,
+    int n_threads
 );
 
 extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out);

+ 2 - 1
ggml/examples/unity/unity.cpp

@@ -115,7 +115,7 @@ Hypothesis* unity_decode(
     ((int *)prefix_seq->data)[0]  = job.eos_idx;
     ((int *)prefix_seq->data)[1]  = tgt_lang_idx;
     job.prefix_seq = prefix_seq;
-    return generate_sequence(model, job, encoder_output, nullptr, model.ctx);
+    return generate_sequence(model, job, encoder_output, nullptr, model.ctx, n_threads);
 }
 
 int main(int argc, char ** argv) {
@@ -201,6 +201,7 @@ int main(int argc, char ** argv) {
         int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
         std::cout << std::string((char*)&result_str, n) << std::endl;
         ggml_free(model.ctx);
+
     }
 
     return 0;
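
unity_decode simply forwards the n_threads it receives from main into generate_sequence. How the caller picks that value is outside this diff; a common heuristic (an assumption, not from this commit) is to use the core count while leaving headroom for the OS:

    import os

    # Leave one core free for the OS; never go below 1 thread.
    n_threads = max(1, (os.cpu_count() or 2) - 1)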

+ 151 - 3
ggml/ggml_convert.py

@@ -11,7 +11,7 @@ import struct
 from enum import Enum
 from io import BufferedWriter
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set, final
 
 import torch
 from fairseq2.assets import AssetCard
@@ -19,10 +19,147 @@ from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend
 from fairseq2.nn import SinusoidalPositionEncoder
 from fairseq2.nn.transformer import RelativePositionalEncoding
 from seamless_communication.models import unity
+from fairseq2.data.text import SentencePieceEncoder, SentencePieceTokenizerBase
+from fairseq2.data.typing import PathLike
+from typing import Sequence
+from fairseq2.typing import Device, finaloverride
+from fairseq2.models.utils import TokenizerLoaderBase
+from fairseq2.assets import asset_store, download_manager
+from seamless_communication.models.unity.builder import UnitYConfig, create_unity_model
+from fairseq2.models.utils import ModelLoader
+from seamless_communication.models.unity.model import UnitYModel
 
 import ggml
 
 Preprocessor = Callable[[Any], Any]
+SMALLER_MODELS = [
+    "unity_nano",
+    "unity_micro",
+]  # Trained with fairseq2, with custom dicts (not the original NLLB ones)
+
+
+@final
+class NllbLikeTokenizer(SentencePieceTokenizerBase):
+    """The only difference between this class and NllbTokenizer is it doesn't add a <pad> to control symbol list.
+    Since NllbTokenizer is defined as final, we couldn't inherit from it directly. So copying ~everything"""
+
+    langs: Set[str]
+    default_lang: str
+
+    def __init__(
+        self, pathname: PathLike, langs: Sequence[str], default_lang: str
+    ) -> None:
+        """
+        :param pathname:
+            The pathname of the SentencePiece model file.
+        :param langs:
+            The list of supported languages.
+        :param default_lang:
+            The fall-back language if no language is specified.
+        """
+        # Each language is represented by a `__lang__` control symbol.
+        control_symbols = [f"__{lang}__" for lang in langs]
+
+        # Internal control symbols that are not relevant for eval use.
+        control_symbols.extend(["<MINED_DATA>", "<MMT_BT_DATA>", "<SMT_BT_DATA>"])
+        super().__init__(pathname, control_symbols)
+
+        self.langs = set(langs)
+
+        self.default_lang = default_lang
+
+    @finaloverride
+    def create_encoder(
+        self,
+        *,
+        task: Optional[str] = None,
+        lang: Optional[str] = None,
+        mode: Optional[str] = None,
+        device: Optional[Device] = None,
+        pin_memory: bool = False,
+    ) -> SentencePieceEncoder:
+        """Create a token encoder.
+
+        :param task:
+            Must be 'translation'. If ``None``, defaults to 'translation'.
+        :param lang:
+            A language from :attr:`langs`. If ``None``, defaults to
+            :attr:`default_lang`.
+        :param mode:
+            Must be 'source' or 'target'. Set to 'source' if ``lang`` is the
+            source language; set to 'target' if ``lang`` is the target language.
+            If ``None``, defaults to 'source'.
+        :param device:
+            The device on which to construct tensors.
+        :param pin_memory:
+            If ``True``, uses pinned memory while constructing tensors.
+        """
+        if task is not None and task != "translation":
+            raise ValueError(f"`task` must be 'translation', but is '{task}' instead.")
+
+        if lang is None:
+            lang = self.default_lang
+
+        if lang not in self.langs:
+            raise ValueError(
+                f"`lang` must be a supported language, but is '{lang}' instead."
+            )
+
+        if mode is None or mode == "source":
+            # NLLB models expect a language token in place of BOS in source
+            # sequences.
+            prefix_tokens = [f"__{lang}__"]
+            suffix_tokens = ["</s>"]
+        elif mode == "source_mining":
+            prefix_tokens = [f"__{lang}__", "<MINED_DATA>"]
+            suffix_tokens = ["</s>"]
+        elif mode == "source_mmt_bt":
+            prefix_tokens = [f"__{lang}__", "<MMT_BT_DATA>"]
+            suffix_tokens = ["</s>"]
+        elif mode == "source_smt_bt":
+            prefix_tokens = [f"__{lang}__", "<SMT_BT_DATA>"]
+            suffix_tokens = ["</s>"]
+        elif mode == "target":
+            # Target sequences are expected to start with an EOS, followed by
+            # the language token.
+            prefix_tokens = ["</s>", f"__{lang}__"]
+            suffix_tokens = []
+        else:
+            raise ValueError(
+                f"`mode` must be 'source' or 'target', but is '{mode}' instead."
+            )
+
+        return SentencePieceEncoder(
+            self.model,
+            prefix_tokens=prefix_tokens,
+            suffix_tokens=suffix_tokens,
+            device=device,
+            pin_memory=pin_memory,
+        )
+
+
+load_unity_model_without_conversion = ModelLoader[UnitYModel, UnitYConfig](
+    asset_store,
+    download_manager,
+    unity.load_unity_config,
+    create_unity_model,
+    None,
+    restrict_checkpoints=False,
+)
+
+
+@final
+class NllbLikeTokenizerLoader(TokenizerLoaderBase[NllbLikeTokenizer]):
+    """Loads tokenizers used by NLLB models."""
+
+    @finaloverride
+    def _load(self, pathname: Path, card: AssetCard) -> NllbLikeTokenizer:
+        langs = card.field("langs").as_list(str)
+
+        default_lang = card.field("default_lang").as_(str)
+
+        return NllbLikeTokenizer(pathname, langs, default_lang)
 
 
 def convert_model(
@@ -44,9 +181,20 @@ def convert_model(
                     dataclasses.asdict(model_config), separator="__"
                 )
                 print(hparams)
-            model = unity.load_unity_model(model_name)
+            # Need to diverge here because the current default in SC is to convert from the fairseq1 ckpt format
+            if model_name in SMALLER_MODELS:
+                model = load_unity_model_without_conversion(model_name)
+            else:
+                model = unity.load_unity_model(model_name)
             if vocab is None:
-                tokenizer = unity.load_unity_text_tokenizer(model_name)
+                # Need to diverge here because the current default in SC is to add a separate <pad>
+                # as a control symbol in NllbTokenizer
+                if model_name in SMALLER_MODELS:
+                    tokenizer = NllbLikeTokenizerLoader(asset_store, download_manager)(
+                        model_name
+                    )
+                else:
+                    tokenizer = unity.load_unity_text_tokenizer(model_name)
                 vocab = read_vocab(tokenizer)
         else:
             raise ValueError(f"Unsupported model type: {model_name}")
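
The net effect on conversion: the two small models load through fairseq2-native checkpoints and the pad-free tokenizer, while the larger models keep the existing fairseq1-conversion path. A sketch using only names defined in this file (the unity_micro asset cards must be resolvable by the asset store; read_vocab is the existing helper called above):

    model = load_unity_model_without_conversion("unity_micro")
    tokenizer = NllbLikeTokenizerLoader(asset_store, download_manager)("unity_micro")
    vocab = read_vocab(tokenizer)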

+ 18 - 0
src/seamless_communication/cards/unity_micro.yaml

@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: unity_micro
+base: unity_micro_tokenizer
+model_arch: micro
+checkpoint: "file:///large_experiments/seamless/ust/dnn/ggml_models/unity_micro_checkpoint_best.pt"
+num_units: 10000
+unit_langs:
+  - eng
+  - hin
+  - por
+  - rus
+  - spa
+

+ 18 - 0
src/seamless_communication/cards/unity_micro_tokenizer.yaml

@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: unity_micro_tokenizer
+model_type: unity
+tokenizer: "file:///large_experiments/seamless/ust/dnn/ggml_models/5_5_20k.model"
+default_lang: eng
+langs:
+  - eng
+  - rus
+  - por
+  - hin
+  - spa
+
+

+ 17 - 0
src/seamless_communication/cards/unity_nano.yaml

@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: unity_nano
+base: unity_micro_tokenizer
+model_arch: nano
+checkpoint: "file:///large_experiments/seamless/ust/dnn/ggml_models/unity_nano_checkpoint_best.pt"
+num_units: 10000
+unit_langs:
+  - eng
+  - hin
+  - por
+  - rus
+  - spa
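
Both model cards chain to the shared unity_micro_tokenizer card through their base: field, so one SentencePiece model serves both sizes and fields like default_lang are inherited. One way to inspect how a card resolves (retrieve_card and field(...).as_(...) are standard fairseq2 asset APIs; the cards must be on the asset store's search path):

    from fairseq2.assets import asset_store

    card = asset_store.retrieve_card("unity_micro")
    print(card.field("model_arch").as_(str))    # "micro"
    print(card.field("default_lang").as_(str))  # "eng", inherited from unity_micro_tokenizer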

+ 155 - 1
src/seamless_communication/models/unity/builder.py

@@ -6,7 +6,7 @@
 
 from dataclasses import dataclass
 from typing import Optional, Union
-
+from fairseq2.data import VocabularyInfo
 from fairseq2.models.conformer import ConformerBlock, ConformerConvolution
 from fairseq2.models.nllb import NllbBuilder, NllbConfig, nllb_archs
 from fairseq2.models.utils.arch_registry import ArchitectureRegistry
@@ -161,6 +161,160 @@ def _medium() -> UnitYConfig:
     )
 
 
+@unity_arch("micro")
+def _micro() -> UnitYConfig:
+    return UnitYConfig(
+        model_dim=512,
+        w2v2_encoder_config=Wav2Vec2EncoderConfig(
+            model_dim=512,
+            max_seq_len=4096,
+            feature_dim=160,
+            use_fbank=True,
+            first_pass_dropout_p=0.0,
+            layer_norm_features=False,
+            feature_extractor_layer_descs=[],
+            feature_extractor_bias=False,
+            feature_extractor_layer_norm_convs=False,
+            feature_grad_scale=0,
+            num_fbank_channels=80,
+            fbank_stride=2,
+            sample_fbank_every_k=1,
+            pos_encoder_type="relative",
+            pos_encoder_depth=1,
+            pos_conv_kernel_size=128,
+            num_pos_conv_groups=16,
+            use_conformer=True,
+            num_encoder_layers=6,
+            num_encoder_attn_heads=16,
+            ffn_inner_dim=512 * 4,
+            dropout_p=0.0,
+            attn_dropout_p=0.0,
+            layer_drop_p=0.0,
+            norm_order=TransformerNormOrder.POST,
+            depthwise_conv_kernel_size=31,
+        ),
+        mt_model_config=NllbConfig(
+            model_dim=512,
+            max_seq_len=1024,
+            vocab_info=VocabularyInfo(
+                size=20010, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+            ),
+            num_encoder_layers=1,
+            num_decoder_layers=3,
+            num_encoder_attn_heads=16,
+            num_decoder_attn_heads=16,
+            ffn_inner_dim=512 * 8,
+            dropout_p=0.1,
+        ),
+        t2u_config=UnitYT2UConfig(
+            model_dim=512,
+            unit_max_seq_len=2048,
+            target_vocab_info=VocabularyInfo(
+                size=10082, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+            ),
+            num_encoder_layers=1,
+            num_decoder_layers=1,
+            nar_decoder_frontend_config=None,
+            nar_decoder_config=None,
+            num_encoder_attn_heads=16,
+            num_decoder_attn_heads=16,
+            ffn_inner_dim=512 * 8,
+            dropout_p=0.1,
+            use_gelu=False,
+            char_pad_idx=False,
+            use_prosody_proj=False,
+            prosody_encoder_dim=False,
+        ),
+        use_text_encoder=True,
+        use_conformer_adaptor=False,
+        num_adaptor_layers=1,
+        adaptor_kernel_size=8,
+        adaptor_stride=8,
+        adaptor_layer_norm=True,
+        adaptor_dropout_p=0.1,
+        prosody_encoder_config=None,
+        use_text_decoder=True,
+        use_gelu=False,
+    )
+
+
+@unity_arch("nano")
+def _nano() -> UnitYConfig:
+    return UnitYConfig(
+        model_dim=256,
+        w2v2_encoder_config=Wav2Vec2EncoderConfig(
+            model_dim=256,
+            max_seq_len=4096,
+            feature_dim=160,
+            use_fbank=True,
+            first_pass_dropout_p=0.0,
+            layer_norm_features=False,
+            feature_extractor_layer_descs=[],
+            feature_extractor_bias=False,
+            feature_extractor_layer_norm_convs=False,
+            feature_grad_scale=0,
+            num_fbank_channels=80,
+            fbank_stride=2,
+            sample_fbank_every_k=1,
+            pos_encoder_type="relative",
+            pos_encoder_depth=1,
+            pos_conv_kernel_size=128,
+            num_pos_conv_groups=16,
+            use_conformer=True,
+            num_encoder_layers=6,
+            num_encoder_attn_heads=16,
+            ffn_inner_dim=256 * 4,
+            dropout_p=0.0,
+            attn_dropout_p=0.0,
+            layer_drop_p=0.0,
+            norm_order=TransformerNormOrder.POST,
+            depthwise_conv_kernel_size=31,
+        ),
+        mt_model_config=NllbConfig(
+            model_dim=256,
+            max_seq_len=1024,
+            vocab_info=VocabularyInfo(
+                size=20010, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+            ),
+            num_encoder_layers=1,
+            num_decoder_layers=3,
+            num_encoder_attn_heads=16,
+            num_decoder_attn_heads=16,
+            ffn_inner_dim=256 * 8,
+            dropout_p=0.1,
+        ),
+        t2u_config=UnitYT2UConfig(
+            model_dim=256,
+            unit_max_seq_len=2048,
+            target_vocab_info=VocabularyInfo(
+                size=10082, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+            ),
+            num_encoder_layers=1,
+            num_decoder_layers=1,
+            nar_decoder_frontend_config=None,
+            nar_decoder_config=None,
+            num_encoder_attn_heads=16,
+            num_decoder_attn_heads=16,
+            ffn_inner_dim=256 * 8,
+            dropout_p=0.1,
+            use_gelu=False,
+            char_pad_idx=False,
+            use_prosody_proj=False,
+            prosody_encoder_dim=False,
+        ),
+        use_text_encoder=True,
+        use_conformer_adaptor=False,
+        num_adaptor_layers=1,
+        adaptor_kernel_size=8,
+        adaptor_stride=8,
+        adaptor_layer_norm=True,
+        adaptor_dropout_p=0.1,
+        prosody_encoder_config=None,
+        use_text_decoder=True,
+        use_gelu=False,
+    )
+
+
 @unity_arch("base_v2")
 def _base_v2() -> UnitYConfig:
     w2v2_chunk_encoder_config = wav2vec2_chunk_archs.get_config("600m")
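
With micro and nano registered via @unity_arch, they resolve through the same architecture registry as the existing sizes, which is what lets load_unity_model_without_conversion build them from a card's model_arch field. A minimal sketch (unity_archs is assumed to be the registry behind the @unity_arch decorator, with get_config as the standard ArchitectureRegistry accessor; device/dtype arguments are omitted):

    from seamless_communication.models.unity.builder import create_unity_model, unity_archs

    config = unity_archs.get_config("micro")
    model = create_unity_model(config)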