
Fix pad_idx bugs with M4T v2, streaming, expressivity models. (#181)

* Set _legacy_pad_idx to 1 for the NLLB-based pos encoder; fix char_pad_idx and _legacy_pad_idx for the char pos encoder (see the sketch below).

* Remove test_unit_extractor since it's already tested in test_expressivity.
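
For context, a minimal sketch (not part of the commit) of why the _legacy_pad_idx value matters. In fairseq2, SinusoidalPositionEncoder takes _legacy_pad_idx for fairseq1 compatibility, where position numbering starts after the pad index, so encoders built with _legacy_pad_idx=0 and _legacy_pad_idx=1 emit shifted positional encodings and the ggml port has to match the reference value. The dimensions below mirror the updated test; the offset behaviour is assumed from fairseq1's convention.

import torch
import fairseq2.nn

# A zero input makes the output equal to the positional encodings alone,
# since the encoder adds its encodings to the input sequence.
seq = torch.zeros((1, 20, 1024), dtype=torch.float32)

enc_pad0 = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
enc_pad1 = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1)

# Assumption: fairseq1 numbered positions from pad_idx + 1, so the two
# encoders start at different offsets and their outputs differ.
print(torch.allclose(enc_pad0(seq, None), enc_pad1(seq, None)))  # expected: False
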
Kaushik Ram Sadagopan 1 year ago
parent
commit
57ef6e61db

+ 4 - 5  ggml/test_unity_cpp.py

@@ -380,7 +380,7 @@ def test_StandardConformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> N
         pytest.skip(reason=f"Folder {DATA_DEV} not found !")
 
     x = torch.load(DATA_DEV / "seqs_before_conformer_block.pt")
-    padding_mask = PaddingMask(torch.ones(1, x.shape[1]),x.shape[1])
+    padding_mask = PaddingMask(torch.ones(1, x.shape[1]), x.shape[1])
 
     layer = pt_model.speech_encoder.inner.layers[0]
     gx = ggml.from_numpy(ctx, x[0])
@@ -545,9 +545,8 @@ def test_WaveformToFbank_forward(ctx: Ctx, g_model: c_void_p) -> None:
 
 def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
     seq = torch.zeros((4, 20, 1024), dtype=torch.float32)
-    # this _legacy_pad_idx is suspicious. Shouldn't the model use 1 ? But
-    # this is consistent with pt_model.text_decoder_frontend.pos_encoder._sin_offset
-    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
+
+    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1)
     y_exp = pos_encoder(seq, None)[0].numpy()
 
     gseq = ggml.from_numpy(ctx, seq[0].clone().numpy())
@@ -565,7 +564,7 @@ def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
 
 def test_PositionalEmbedding_forward_with_cache(ctx: Ctx, g_model: c_void_p) -> None:
     seq = torch.zeros((4, 20, 1024), dtype=torch.float32)
-    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
+    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1)
     pos_encoder.eval()
     state_bag = fairseq2.nn.IncrementalStateBag(100)
 

+ 1 - 1  src/seamless_communication/cli/m4t/train/model.py

@@ -370,7 +370,7 @@ class ModelBuilder:
             ),
             t2u_config=UnitYT2UConfig(
                 use_gelu=False,
-                char_pad_idx=0,
+                char_pad_idx=1,
                 use_prosody_proj=False,
                 prosody_encoder_dim=0,
                 model_dim=config.model_embed_dim,

+ 2 - 2  src/seamless_communication/models/monotonic_decoder/builder.py

@@ -90,7 +90,7 @@ def _dense_1b() -> MonotonicDecoderConfig:
         model_dim=1024,
         max_seq_len=4096,
         vocab_info=VocabularyInfo(
-            size=256102, unk_idx=1, bos_idx=2, eos_idx=3, pad_idx=1
+            size=256102, unk_idx=1, bos_idx=2, eos_idx=3, pad_idx=0
         ),
         num_decoder_layers=24,
         num_decoder_attn_heads=16,
@@ -164,7 +164,7 @@ class MonotonicDecoderBuilder:
         pos_encoder = SinusoidalPositionEncoder(
             self.config.model_dim,
             self.config.max_seq_len,
-            _legacy_pad_idx=self.config.vocab_info.pad_idx,
+            _legacy_pad_idx=1,
             device=self.device,
         )
 

+ 0 - 2  src/seamless_communication/models/unity/builder.py

@@ -199,8 +199,6 @@ def _expressivity_v2() -> UnitYConfig:
 
     mt_model_config.vocab_info.size = 256102  # NLLB-100
 
-    mt_model_config.vocab_info.pad_idx = 1
-
     mt_model_config.max_seq_len = 10000
 
     t2u_config = unity_t2u_archs.get_config("expressivity_nar")

+ 6 - 4  src/seamless_communication/models/unity/t2u_builder.py

@@ -154,7 +154,7 @@ def _base_t2u() -> UnitYT2UConfig:
         ffn_inner_dim=1024 * 8,
         dropout_p=0.1,
         use_gelu=False,
-        char_pad_idx=0,
+        char_pad_idx=1,
         use_prosody_proj=False,
         prosody_encoder_dim=0,
     )
@@ -177,7 +177,7 @@ def _medium_t2u() -> UnitYT2UConfig:
         ffn_inner_dim=1024 * 8,
         dropout_p=0.1,
         use_gelu=False,
-        char_pad_idx=0,
+        char_pad_idx=1,
         use_prosody_proj=False,
         prosody_encoder_dim=0,
     )
@@ -226,7 +226,7 @@ def _base_nar() -> UnitYT2UConfig:
         ffn_inner_dim=1024 * 8,
         dropout_p=0.0,
         use_gelu=False,
-        char_pad_idx=0,
+        char_pad_idx=1,
         use_prosody_proj=False,
         prosody_encoder_dim=0,
     )
@@ -602,10 +602,12 @@ class UnitYNART2UBuilder:
             self.config.nar_decoder_config.model_name_or_card
         )
 
+        # The legacy pad idx should be the same as that of the unit_pos_encoder,
+        # since in fairseq1 the pos encoder is shared between both char, units.
         char_pos_encoder = SinusoidalPositionEncoder(
             self.config.model_dim,
             self.config.nar_decoder_config.char_max_seq_len,
-            _legacy_pad_idx=self.config.char_pad_idx,
+            _legacy_pad_idx=self.config.target_vocab_info.pad_idx,
             device=self.device,
         )
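
(Not part of the diff.) A small self-contained sketch of the invariant described in the comment above: the char position encoder has to use the same legacy pad idx as the unit position encoder so that both assign identical positions to the same timestep, matching fairseq1 where a single pos encoder was shared between chars and units. The pad_idx value of 0 below is an illustrative assumption standing in for target_vocab_info.pad_idx.

import torch
from fairseq2.nn import SinusoidalPositionEncoder

model_dim, max_seq_len = 1024, 4096
pad_idx = 0  # stand-in for self.config.target_vocab_info.pad_idx

char_pos_encoder = SinusoidalPositionEncoder(model_dim, max_seq_len, _legacy_pad_idx=pad_idx)
unit_pos_encoder = SinusoidalPositionEncoder(model_dim, max_seq_len, _legacy_pad_idx=pad_idx)

# Sharing the legacy pad idx keeps char and unit positions aligned.
seq = torch.zeros((1, 16, model_dim))
print(torch.allclose(char_pos_encoder(seq, None), unit_pos_encoder(seq, None)))  # expected: True
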
 

+ 2 - 2  tests/integration/inference/test_translator.py

@@ -55,8 +55,8 @@ def test_seamless_m4t_v2_large_t2tt() -> None:
 def test_seamless_m4t_v2_large_multiple_tasks() -> None:
     model_name = "seamlessM4T_v2_large"
     english_text = "Hello! I hope you're all doing well."
-    ref_spanish_text = "Hola, espero que todos estéis haciendo bien."
-    ref_spanish_asr_text = "Hola, espero que todos estéis haciendo bien."
+    ref_spanish_text = "Hola, espero que todo se esté haciendo bien."
+    ref_spanish_asr_text = "Hola, espero que todo se esté haciendo bien."
 
     dtype = get_default_dtype()
 

+ 0 - 2  tests/integration/models/test_expressivity.py

@@ -13,12 +13,10 @@ from torch import tensor
 from seamless_communication.inference import Translator
 from seamless_communication.inference.pretssel_generator import PretsselGenerator
 from seamless_communication.models.unit_extractor import UnitExtractor
-from seamless_communication.models.unity import load_gcmvn_stats
 from tests.common import (
     assert_unit_close,
     convert_to_collated_fbank,
     device,
-    get_default_dtype,
 )
 
 # fmt: off

+ 0 - 50  tests/integration/models/test_unit_extractor.py

@@ -1,50 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-from typing import Final
-
-import torch
-from torch import tensor
-
-from fairseq2.typing import Device
-from seamless_communication.inference import Translator
-from seamless_communication.models.unit_extractor import UnitExtractor
-from tests.common import assert_equal
-
-# fmt: off
-REF_ENG_UNITS: Final = [8976, 8299, 0, 0, 9692, 5395, 785, 785, 7805, 6193, 2922, 4806, 3362, 3560, 8119, 8119, 4335, 205, 5424, 5424, 5064, 7421, 6547, 9952, 3728, 8544, 3321, 1093, 1443, 7962, 3978, 8063, 5168, 5491, 9133, 9275, 5912, 8729, 5097, 5495, 1650, 5048, 2839, 6756, 5665, 4191, 5205, 5205, 9568, 9568, 5932, 1190, 9339, 5839, 5839, 6244, 5320, 3454, 5216, 721, 6994, 6513, 7754, 3469, 296, 1849, 3254, 3254, 5042, 5042, 3961, 2079, 1907, 1846, 661, 2225, 944, 9295, 4712, 1785, 6060, 8701, 7646, 1355, 2876, 8199, 5901, 8199, 3861, 5153, 6420, 2897, 1389, 334, 6334]
-# fmt: on
-
-
-def test_unit_extractor() -> None:
-    model_name = "seamlessM4T_v2_large"
-    english_text = "Hello! I hope you're all doing well."
-
-    # We can't test on the GPU since the output is non-deterministic.
-    device = Device("cpu")
-    dtype = torch.float32
-
-    translator = Translator(model_name, "vocoder_v2", device, dtype=dtype)
-
-    # Generate english speech for the english text.
-    _, speech_output = translator.predict(
-        english_text,
-        "t2st",
-        "eng",
-        src_lang="eng",
-    )
-    assert speech_output is not None
-
-    unit_extractor = UnitExtractor(
-        "xlsr2_1b_v2",
-        "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy",
-        device=device,
-        dtype=torch.float32,
-    )
-
-    units = unit_extractor.predict(speech_output.audio_wavs[0][0], 34)
-
-    assert_equal(units, tensor(REF_ENG_UNITS, device=device, dtype=torch.int64))

+ 1 - 0  tests/integration/models/test_watermarked_vocoder.py

@@ -148,6 +148,7 @@ def test_pretssel_vocoder_watermarking(
     )
 
 
+@pytest.mark.skip(reason="Skip this test since it's extremely slow.")
 def test_e2e_watermark_audio() -> None:
     data_file = "/large_experiments/seamless/data/expressivity/fairseq_manifest/benchmark_20231025/test_examples_20231122.tsv"
     model_name = "seamless_expressivity"