@@ -0,0 +1,238 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+import torch
+
+from seamless_communication.models.unity import UnitTokenizer
+from tests.common import assert_equal, device
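+
+# Note: the expected indices in these tests follow the vocabulary layout
+# implied by the assertions themselves (an inference from this file, not a
+# documented spec): 4 control symbols (bos=0, pad=1, eos=2, unk=3), then the
+# units offset by 4, then the language symbols. For "seamlessM4T_large" the
+# language block appears to be stored twice (each copy with one extra symbol,
+# presumably a mask), giving 4 + 100 + 2 * (3 + 1) = 112; for
+# "seamlessM4T_large_v2" it is stored once, giving 4 + 100 + (3 + 1) = 108.
+
+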
+class TestUnitTokenizer:
+    def test_init_works(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        assert tokenizer.num_units == 100
+
+        assert tokenizer.lang_map == {"eng": 0, "deu": 1, "fra": 2}
+
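+        # 112 = 4 control symbols + 100 units + 2 * (3 langs + 1); see the
+        # layout note at the top of the file.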
+        assert tokenizer.vocab_info.size == 112
+
+    def test_lang_to_index_works(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
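+        # With the duplicated language block, "eng" starts at 4 + 100 + 4 = 108.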
+        assert tokenizer.lang_to_index("eng") == 108
+        assert tokenizer.lang_to_index("deu") == 109
+        assert tokenizer.lang_to_index("fra") == 110
+
+    def test_lang_to_index_works_nar_decoder(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100,
+            langs=["eng", "deu", "fra"],
+            model_arch="seamlessM4T_large_v2",
+        )
+        assert tokenizer.vocab_info.size == 108
+
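+        # With a single language block, "eng" sits right after the units:
+        # 4 + 100 = 104.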
+        assert tokenizer.lang_to_index("eng") == 104
+        assert tokenizer.lang_to_index("deu") == 105
+        assert tokenizer.lang_to_index("fra") == 106
+
+    def test_lang_to_index_raises_error_when_lang_is_not_supported(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        with pytest.raises(
+            ValueError,
+            match=r"^`lang` must be one of the supported languages, but is 'foo' instead\. Supported languages: eng, deu, fra$",
+        ):
+            tokenizer.lang_to_index("foo")
+
+    def test_index_to_lang_works(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        assert tokenizer.index_to_lang(108) == "eng"
+        assert tokenizer.index_to_lang(109) == "deu"
+        assert tokenizer.index_to_lang(110) == "fra"
+
+    def test_index_to_lang_works_nar_decoder(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100,
+            langs=["eng", "deu", "fra"],
+            model_arch="seamlessM4T_large_v2",
+        )
+
+        assert tokenizer.index_to_lang(104) == "eng"
+        assert tokenizer.index_to_lang(105) == "deu"
+        assert tokenizer.index_to_lang(106) == "fra"
+
+    def test_vocab_control_symbols(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        assert tokenizer.vocab_info.bos_idx == 0
+        assert tokenizer.vocab_info.pad_idx == 1
+        assert tokenizer.vocab_info.eos_idx == 2
+        assert tokenizer.vocab_info.unk_idx == 3
+
+    def test_index_to_lang_raises_error_when_idx_is_out_of_range(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        with pytest.raises(
+            ValueError,
+            match=r"^`idx` must correspond to one of the supported language symbol indices \(0 to 2\), but is 1234 instead\.$",
+        ):
+            tokenizer.index_to_lang(1234)
+
+
+class TestUnitEncoder:
+    def test_init_raises_error_when_lang_is_not_supported(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        with pytest.raises(
+            ValueError,
+            match=r"^`lang` must be one of the supported languages, but is 'xyz' instead\. Supported languages: eng, deu, fra$",
+        ):
+            tokenizer.create_encoder(lang="xyz", device=device)
+
+    def test_call_works(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
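+        # Expected prefix is [eos, lang]: eos_idx == 2 and, under the layout
+        # note above, lang_to_index("deu") == 109.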
+        prefix = torch.tensor([2, 109], device=device, dtype=torch.int64)
+
+        encoder = tokenizer.create_encoder(lang="deu", device=device)
+
+        # Empty units.
+        units = torch.ones((1, 0), device=device, dtype=torch.int64)
+
+        assert_equal(encoder(units), prefix.expand(1, -1))
+
+        # Batched units.
+        units = torch.ones((6, 4), device=device, dtype=torch.int64)
+
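+        # The units themselves are shifted past the 4 control symbols.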
+        assert_equal(
+            encoder(units), torch.cat([prefix.expand(6, -1), units + 4], dim=1)
+        )
+
+    def test_call_works_nar_decoder(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100,
+            langs=["eng", "deu", "fra"],
+            model_arch="seamlessM4T_large_v2",
+        )
+
+        encoder = tokenizer.create_encoder(lang="deu", device=device)
+
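+        # The v2 (NAR) encoder apparently prepends no [eos, lang] prefix; it
+        # only offsets the units.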
+        # Empty units.
+        units = torch.ones((1, 0), device=device, dtype=torch.int64)
+
+        assert_equal(encoder(units), units)
+
+        # Batched units.
+        units = torch.ones((6, 4), device=device, dtype=torch.int64)
+
+        assert_equal(encoder(units), units + 4)
+
+    def test_call_works_when_units_have_unks(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        encoder = tokenizer.create_encoder(lang="deu", device=device)
+
+        units = torch.ones((6, 4), device=device, dtype=torch.int64)
+
+        units[1, 3] = 100
+        units[2, 1] = 101
+
+        token_indices = encoder(units)
+
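+        # Units outside [0, num_units) should come back as unk; the column
+        # indices shift by 2 for the [eos, lang] prefix.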
+        assert token_indices[1, 5].item() == tokenizer.vocab_info.unk_idx
+        assert token_indices[2, 3].item() == tokenizer.vocab_info.unk_idx
+
+    def test_call_works_when_units_have_unks_nar_decoder(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100,
+            langs=["eng", "deu", "fra"],
+            model_arch="seamlessM4T_large_v2",
+        )
+
+        encoder = tokenizer.create_encoder(lang="deu", device=device)
+
+        units = torch.ones((6, 4), device=device, dtype=torch.int64)
+
+        units[1, 3] = 100
+        units[2, 1] = 101
+
+        token_indices = encoder(units)
+
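+        # No prefix under v2, so the unk positions are unchanged.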
+        assert token_indices[1, 3].item() == tokenizer.vocab_info.unk_idx
+        assert token_indices[2, 1].item() == tokenizer.vocab_info.unk_idx
+
+
+class TestUnitDecoder:
+    def test_call_works(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
+        )
+
+        encoder = tokenizer.create_encoder(lang="deu", device=device)
+        decoder = tokenizer.create_decoder()
+
+        assert tokenizer.vocab_info.eos_idx is not None
+        assert tokenizer.vocab_info.pad_idx is not None
+
+        units1 = torch.ones((6, 4), device=device, dtype=torch.int64)
+
+        encoded_units = encoder(units1)
+
+        encoded_units[2, 2] = tokenizer.vocab_info.eos_idx
+
+        units2 = decoder(encoded_units)
+
+        units1[2, 2] = tokenizer.vocab_info.pad_idx
+
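+        # Round-trip expectation: the leading eos is dropped, the lang symbol
+        # (109) is kept, and the eos written above comes back as pad.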
+        prefix = torch.tensor([109], device=device, dtype=torch.int64)
+
+        assert_equal(torch.cat([prefix.expand(6, -1), units1], dim=1), units2)
+
+    def test_call_works_nar_decoder(self) -> None:
+        tokenizer = UnitTokenizer(
+            num_units=100,
+            langs=["eng", "deu", "fra"],
+            model_arch="seamlessM4T_large_v2",
+        )
+
+        encoder = tokenizer.create_encoder(lang="deu", device=device)
+        decoder = tokenizer.create_decoder()
+
+        assert tokenizer.vocab_info.eos_idx is not None
+        assert tokenizer.vocab_info.pad_idx is not None
+
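+        # With no prefix under v2, decoding should give back exactly the
+        # units, with the written eos returned as pad.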
+        units1 = torch.ones((6, 4), device=device, dtype=torch.int64)
+
+        encoded_units = encoder(units1)
+
+        encoded_units[2, 2] = tokenizer.vocab_info.eos_idx
+
+        units2 = decoder(encoded_units)
+
+        units1[2, 2] = tokenizer.vocab_info.pad_idx
+
+        assert_equal(units1, units2)