
Fix pad_idx bugs with M4T v2, streaming, expressivity models. (#181)

* Set _legacy_pad_idx to 1 for the NLLB-based pos encoder; fix char_pad_idx and _legacy_pad_idx for the char pos encoder (see the sketch below).

* Remove test_unit_extractor since it's already tested in test_expressivity.
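
For context, a minimal sketch (not part of the commit) of why the _legacy_pad_idx value matters. In fairseq2, SinusoidalPositionEncoder takes _legacy_pad_idx for fairseq1 compatibility, where position numbering starts after the pad index, so encoders built with _legacy_pad_idx=0 and _legacy_pad_idx=1 emit shifted positional encodings and the ggml port has to match the reference value. The dimensions below mirror the updated test; the offset behaviour is assumed from fairseq1's convention.

import torch
import fairseq2.nn

# A zero input makes the output equal to the positional encodings alone,
# since the encoder adds its encodings to the input sequence.
seq = torch.zeros((1, 20, 1024), dtype=torch.float32)

enc_pad0 = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
enc_pad1 = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1)

# Assumption: fairseq1 numbered positions from pad_idx + 1, so the two
# encoders start at different offsets and their outputs differ.
print(torch.allclose(enc_pad0(seq, None), enc_pad1(seq, None)))  # expected: False
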
Kaushik Ram Sadagopan 1 year ago
parent
commit
57ef6e61db

+ 4 - 5  ggml/test_unity_cpp.py

@@ -380,7 +380,7 @@ def test_StandardConformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> N
         pytest.skip(reason=f"Folder {DATA_DEV} not found !")
 
     x = torch.load(DATA_DEV / "seqs_before_conformer_block.pt")
-    padding_mask = PaddingMask(torch.ones(1, x.shape[1]),x.shape[1])
+    padding_mask = PaddingMask(torch.ones(1, x.shape[1]), x.shape[1])
 
     layer = pt_model.speech_encoder.inner.layers[0]
     gx = ggml.from_numpy(ctx, x[0])
@@ -545,9 +545,8 @@ def test_WaveformToFbank_forward(ctx: Ctx, g_model: c_void_p) -> None:
 
 def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
     seq = torch.zeros((4, 20, 1024), dtype=torch.float32)
-    # this _legacy_pad_idx is suspicious. Shouldn't the model use 1 ? But
-    # this is consistent with pt_model.text_decoder_frontend.pos_encoder._sin_offset
-    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
+
+    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1)
     y_exp = pos_encoder(seq, None)[0].numpy()
 
     gseq = ggml.from_numpy(ctx, seq[0].clone().numpy())
@@ -565,7 +564,7 @@ def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
 
 def test_PositionalEmbedding_forward_with_cache(ctx: Ctx, g_model: c_void_p) -> None:
     seq = torch.zeros((4, 20, 1024), dtype=torch.float32)
-    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
+    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1)
     pos_encoder.eval()
     state_bag = fairseq2.nn.IncrementalStateBag(100)
 

+ 1 - 1  src/seamless_communication/cli/m4t/train/model.py

@@ -370,7 +370,7 @@ class ModelBuilder:
             ),
             t2u_config=UnitYT2UConfig(
                 use_gelu=False,
-                char_pad_idx=0,
+                char_pad_idx=1,
                 use_prosody_proj=False,
                 prosody_encoder_dim=0,
                 model_dim=config.model_embed_dim,

+ 2 - 2  src/seamless_communication/models/monotonic_decoder/builder.py

@@ -90,7 +90,7 @@ def _dense_1b() -> MonotonicDecoderConfig:
         model_dim=1024,
         max_seq_len=4096,
         vocab_info=VocabularyInfo(
-            size=256102, unk_idx=1, bos_idx=2, eos_idx=3, pad_idx=1
+            size=256102, unk_idx=1, bos_idx=2, eos_idx=3, pad_idx=0
         ),
         num_decoder_layers=24,
         num_decoder_attn_heads=16,
@@ -164,7 +164,7 @@ class MonotonicDecoderBuilder:
         pos_encoder = SinusoidalPositionEncoder(
             self.config.model_dim,
             self.config.max_seq_len,
-            _legacy_pad_idx=self.config.vocab_info.pad_idx,
+            _legacy_pad_idx=1,
             device=self.device,
         )
 

+ 0 - 2  src/seamless_communication/models/unity/builder.py

@@ -199,8 +199,6 @@ def _expressivity_v2() -> UnitYConfig:
 
     mt_model_config.vocab_info.size = 256102  # NLLB-100
 
-    mt_model_config.vocab_info.pad_idx = 1
-
     mt_model_config.max_seq_len = 10000
 
     t2u_config = unity_t2u_archs.get_config("expressivity_nar")

+ 6 - 4  src/seamless_communication/models/unity/t2u_builder.py

@@ -154,7 +154,7 @@ def _base_t2u() -> UnitYT2UConfig:
         ffn_inner_dim=1024 * 8,
         dropout_p=0.1,
         use_gelu=False,
-        char_pad_idx=0,
+        char_pad_idx=1,
         use_prosody_proj=False,
         prosody_encoder_dim=0,
     )
@@ -177,7 +177,7 @@ def _medium_t2u() -> UnitYT2UConfig:
         ffn_inner_dim=1024 * 8,
         dropout_p=0.1,
         use_gelu=False,
-        char_pad_idx=0,
+        char_pad_idx=1,
         use_prosody_proj=False,
         prosody_encoder_dim=0,
     )
@@ -226,7 +226,7 @@ def _base_nar() -> UnitYT2UConfig:
         ffn_inner_dim=1024 * 8,
         dropout_p=0.0,
         use_gelu=False,
-        char_pad_idx=0,
+        char_pad_idx=1,
         use_prosody_proj=False,
         prosody_encoder_dim=0,
     )
@@ -602,10 +602,12 @@ class UnitYNART2UBuilder:
             self.config.nar_decoder_config.model_name_or_card
         )
 
+        # The legacy pad idx should be the same as that of the unit_pos_encoder,
+        # since in fairseq1 the pos encoder is shared between both char, units.
         char_pos_encoder = SinusoidalPositionEncoder(
             self.config.model_dim,
             self.config.nar_decoder_config.char_max_seq_len,
-            _legacy_pad_idx=self.config.char_pad_idx,
+            _legacy_pad_idx=self.config.target_vocab_info.pad_idx,
             device=self.device,
         )
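
(Not part of the diff.) A small self-contained sketch of the invariant described in the comment above: the char position encoder has to use the same legacy pad idx as the unit position encoder so that both assign identical positions to the same timestep, matching fairseq1 where a single pos encoder was shared between chars and units. The pad_idx value of 0 below is an illustrative assumption standing in for target_vocab_info.pad_idx.

import torch
from fairseq2.nn import SinusoidalPositionEncoder

model_dim, max_seq_len = 1024, 4096
pad_idx = 0  # stand-in for self.config.target_vocab_info.pad_idx

char_pos_encoder = SinusoidalPositionEncoder(model_dim, max_seq_len, _legacy_pad_idx=pad_idx)
unit_pos_encoder = SinusoidalPositionEncoder(model_dim, max_seq_len, _legacy_pad_idx=pad_idx)

# Sharing the legacy pad idx keeps char and unit positions aligned.
seq = torch.zeros((1, 16, model_dim))
print(torch.allclose(char_pos_encoder(seq, None), unit_pos_encoder(seq, None)))  # expected: True
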
 

+ 2 - 2  tests/integration/inference/test_translator.py

@@ -55,8 +55,8 @@ def test_seamless_m4t_v2_large_t2tt() -> None:
 def test_seamless_m4t_v2_large_multiple_tasks() -> None:
     model_name = "seamlessM4T_v2_large"
     english_text = "Hello! I hope you're all doing well."
-    ref_spanish_text = "Hola, espero que todos estéis haciendo bien."
-    ref_spanish_asr_text = "Hola, espero que todos estéis haciendo bien."
+    ref_spanish_text = "Hola, espero que todo se esté haciendo bien."
+    ref_spanish_asr_text = "Hola, espero que todo se esté haciendo bien."
 
     dtype = get_default_dtype()
 

+ 0 - 2  tests/integration/models/test_expressivity.py

@@ -13,12 +13,10 @@ from torch import tensor
 from seamless_communication.inference import Translator
 from seamless_communication.inference.pretssel_generator import PretsselGenerator
 from seamless_communication.models.unit_extractor import UnitExtractor
-from seamless_communication.models.unity import load_gcmvn_stats
 from tests.common import (
     assert_unit_close,
     convert_to_collated_fbank,
     device,
-    get_default_dtype,
 )
 
 # fmt: off

+ 0 - 50  tests/integration/models/test_unit_extractor.py

@@ -1,50 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-from typing import Final
-
-import torch
-from torch import tensor
-
-from fairseq2.typing import Device
-from seamless_communication.inference import Translator
-from seamless_communication.models.unit_extractor import UnitExtractor
-from tests.common import assert_equal
-
-# fmt: off
-REF_ENG_UNITS: Final = [8976, 8299, 0, 0, 9692, 5395, 785, 785, 7805, 6193, 2922, 4806, 3362, 3560, 8119, 8119, 4335, 205, 5424, 5424, 5064, 7421, 6547, 9952, 3728, 8544, 3321, 1093, 1443, 7962, 3978, 8063, 5168, 5491, 9133, 9275, 5912, 8729, 5097, 5495, 1650, 5048, 2839, 6756, 5665, 4191, 5205, 5205, 9568, 9568, 5932, 1190, 9339, 5839, 5839, 6244, 5320, 3454, 5216, 721, 6994, 6513, 7754, 3469, 296, 1849, 3254, 3254, 5042, 5042, 3961, 2079, 1907, 1846, 661, 2225, 944, 9295, 4712, 1785, 6060, 8701, 7646, 1355, 2876, 8199, 5901, 8199, 3861, 5153, 6420, 2897, 1389, 334, 6334]
-# fmt: on
-
-
-def test_unit_extractor() -> None:
-    model_name = "seamlessM4T_v2_large"
-    english_text = "Hello! I hope you're all doing well."
-
-    # We can't test on the GPU since the output is non-deterministic.
-    device = Device("cpu")
-    dtype = torch.float32
-
-    translator = Translator(model_name, "vocoder_v2", device, dtype=dtype)
-
-    # Generate english speech for the english text.
-    _, speech_output = translator.predict(
-        english_text,
-        "t2st",
-        "eng",
-        src_lang="eng",
-    )
-    assert speech_output is not None
-
-    unit_extractor = UnitExtractor(
-        "xlsr2_1b_v2",
-        "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy",
-        device=device,
-        dtype=torch.float32,
-    )
-
-    units = unit_extractor.predict(speech_output.audio_wavs[0][0], 34)
-
-    assert_equal(units, tensor(REF_ENG_UNITS, device=device, dtype=torch.int64))

+ 1 - 0  tests/integration/models/test_watermarked_vocoder.py

@@ -148,6 +148,7 @@ def test_pretssel_vocoder_watermarking(
     )
 
 
+@pytest.mark.skip(reason="Skip this test since it's extremely slow.")
 def test_e2e_watermark_audio() -> None:
     data_file = "/large_experiments/seamless/data/expressivity/fairseq_manifest/benchmark_20231025/test_examples_20231122.tsv"
     model_name = "seamless_expressivity"