
Make a public-facing watermarked vocoder (PretsselVocoder) (#97)

* initial commit

* add audiocraft to requirements

* all in one file

* cleanup

* remove unused imports

* cleanup and add doc string

* add checkpoint to model card instead

* add pretssel hifigan

* update

* update

* update

* update

* update

* --amend

* Update src/seamless_communication/models/vocoder/builder.py

Co-authored-by: Kaushik Ram Sadagopan <krs@fb.com>

* update

* add pretssel vocoder

* update builder and loader

* Implement PretsselModel & its inference

* update checkpoint handling code

* add mel_vocoder

* refactor inference

* minor fix

* minor fix

* minor fix

* mypy pytest isort black formatting

* change padding to 'same'

* minor renaming

* functional melvocoder + watermark

* remove pretssel from the public-facing module

* checkpoint code

* update checkpoint and fix typos in builder

* update integration test

* make languages updatable automatically

* blend PostNet

* update test cases with new mixing logic of wm layers

* linting

* remove obsolete state dict keys in the checkpoint

* debug the wm deltas error

* linting

* linting

* correct typo

* exclude scripts/convert_mel_hifigan_chkpt.py from the PR

* fix typo

* exclude non-related linting (we will do it in another PR)

* exclude non-related linting (we will do it in another PR)

* exclude non-related linting (we will do it in another PR)

* exclude non-related linting and make it in a separate PR

* exclude non-related linting (we will do it in another PR)

* obfuscate the code further by making torch.tanh embeddable in conv1d, so the watermarking cannot be removed by deleting a single block of code from the vocoder, only multiple blocks at multiple places

* update watermarking.py as a standalone script

* add temporary comments to support the PR review

* fix typos

* fix linting

* update with new loader API from fairseq2

* re-sync ecapa_tdnn with main

* re-sync ecapa_tdnn_builder with main

* fix typo in card

* update compile_checkpoint to new loader API

* Yilin's comments

* check if gcmvn_mean is loaded

* update deps of unity; update test case

* update cuda setting for watermarking

* (1) remove the unity deps from this PR - we will put them in another PR; (2) address the last comments from Alex and Yilin

* tmp

* enable batched watermark inference

* revise

* update config for 24kHz audio

* update model config for 24kHz expected sample rate

* fix model card and checkpoint compilation script

* update test case to 24khz

* linting

* integrate Kaushik's PR #134

* update model card info, update deps on unity

* revise sample_rate

* remove internal comments

---------

Co-authored-by: hady elsahar <hadyelsahar@meta.com>
Co-authored-by: Changhan Wang <changhan@fb.com>
Co-authored-by: Changhan Wang <wangchanghan@gmail.com>
Co-authored-by: Kaushik Ram Sadagopan <krs@fb.com>
Co-authored-by: Tuan Tran <tuantran@devfair0436.h2.fair>
Co-authored-by: Yilin Yang <yilinyang721@gmail.com>
Tuan Tran 1 year ago
parent
commit
9f6ade6ee4

+ 6 - 6
scripts/convert_pretssel_hifigan_chkpt.py → scripts/convert_mel_hifigan_chkpt.py

@@ -9,10 +9,9 @@ out_channels -> upsample_initial_channel
 """
 
 
-def main():
-    chkpt_root = "/checkpoint/mjhwang/experiments/231007-mel_vocoder-mls_multilingual_6lang/train_mls_multilingual_6lang_subset_hifigan.v1_8gpu_adapt"
-    cfg = f"{chkpt_root}/config.yml"
-    # TODO: display cfg
+def main() -> None:
+    # chkpt_root = "/checkpoint/mjhwang/experiments/231007-mel_vocoder-mls_multilingual_6lang/train_mls_multilingual_6lang_subset_hifigan.v1_8gpu_adapt"
+    chkpt_root = "/checkpoint/mjhwang/experiments/231112-mel_vocoder-ai_speech_24khz/train_train_highquality_speech_20231111_no16khz_100000_hifigan.v1_8gpu_adapt"
     chkpt = torch.load(f"{chkpt_root}/checkpoint-400000steps.pkl")
     del chkpt["model"]["discriminator"]
     conv_seq_map = {
@@ -21,7 +20,7 @@ def main():
         ".1.weight_v": ".weight_v",
     }
 
-    def update_key(k):
+    def update_key(k: str) -> str:
         if k.startswith("input_conv"):
             k = k.replace("input_conv", "conv_pre")
         elif k.startswith("upsamples"):
@@ -50,7 +49,8 @@ def main():
     for k in ["optimizer", "scheduler", "steps", "epochs"]:
         del chkpt[k]
 
-    out_path = "/large_experiments/seamless/ust/changhan/checkpoints/fairseq2/pretssel_hifigan.pt"
+    # out_path = "/large_experiments/seamless/ust/changhan/checkpoints/fairseq2/pretssel_hifigan.pt"
+    out_path = "/large_experiments/seamless/workstream/expressivity/oss/checkpoints/melhifigan_20231121.pt"
     torch.save(chkpt, out_path)
 
 

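The hunk above only shows the two renaming rules; how they combine on a single key is easy to sketch (a hypothetical helper, assuming a plain dict state dict and the `update_key` / `conv_seq_map` defined in the script):

```python
# Hypothetical helper sketching how the two renaming rules combine,
# assuming a plain dict state dict; update_key and conv_seq_map are the
# ones defined in convert_mel_hifigan_chkpt.py.
def rename_state_dict(state_dict: dict) -> dict:
    renamed = {}
    for key, value in state_dict.items():
        new_key = update_key(key)  # e.g. "input_conv.1.weight_v" -> "conv_pre.1.weight_v"
        for old, new in conv_seq_map.items():
            new_key = new_key.replace(old, new)  # "conv_pre.1.weight_v" -> "conv_pre.weight_v"
        renamed[new_key] = value
    return renamed
```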
+ 208 - 0
scripts/watermarking/compile_chkpt.py

@@ -0,0 +1,208 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+#
+# The rules to blend the p2v decoder, mel-vocoder and the watermarking:
+#
+# Step 1) Make the big sequential module `layers` that consists of:
+#    - PostNet (last layer of the p2v decoder) : 5 layers
+#    - mel-vocoder layers (conv_pre, ups, resblocks, conv_post): 18 layers
+#    - watermarking encoder and decoder: 32 layers
+#
+# Step 2) Take the 32 watermarker layers, split them into 4 blocks of
+# 8 layers, and mix these blocks into the preceding layers.
+#
+# The final mixed architecture SPVM (Spaghetti Pretssel Vocoder Model):
+#
+#     <P2V: Post Net>
+#           |
+# <Block 1 of Watermarker> ------
+#           |                   |
+#          \/                   |
+#  <Melvocoder: Conv_pre>       |
+#           | (skip)            |
+# <Block 2 of Watermarker> -----|
+#           |                   |
+#          \/                   |
+# <Melvocoder: Upsampler>       |
+#           | (skip)            |
+# <Block 3 of Watermarker> -----|
+#           |                   |
+#          \/                   |
+# <Melvocoder: Resblocks>       |
+#           | (skip)            |
+# <Block 4 of Watermarker> -----|
+#           |                   |
+#          \/                   |
+#  <Melvocoder: Conv_post>      |
+#           |                   |
+#           | ------------------|
+#           |
+#          \/
+#    watermarked wavs
+
+from pathlib import Path
+from argparse import ArgumentParser
+from typing import Any, Mapping, Match
+
+import torch
+from fairseq2.models.utils.checkpoint import (
+    convert_fairseq_checkpoint,
+    convert_model_state_dict,
+    load_checkpoint,
+)
+
+
+def pretssel_key_map() -> Mapping[str, str]:
+    """
+    The rule for renaming the layers of Pretssel model checkpoint:
+        - Merge decoder.postnet into `layers`
+    """
+    from seamless_communication.models.pretssel.loader import _fairseq_key_map  # noqa
+
+    key_map = _fairseq_key_map(None)  # type: ignore[arg-type]
+    del key_map[r"^decoder\.postnet\."]
+    key_map[r"^decoder\.postnet\.convolutions\."] = r"layers."
+    return key_map
+
+
+def vocoder_key_map() -> Mapping[str, Any]:
+    """
+    Rename layers in the mel-vocoder checkpoint. We flatten the vocoder arch and put everything
+    into `layers`, where the `postnet_size` PostNet layers are already present. In other words:
+        - conv_pre -> layers.<postnet_size + watermark_size / 4>
+        - ups.i -> layers.<postnet_size + 1 + i + watermark_size / 2>
+        - resblocks.i -> layers.<postnet_size + 1 + ups_size + i + 3 * watermark_size / 4>
+        - conv_post -> layers.<postnet_size + 1 + ups_size + resblocks_size + watermark_size>
+    """
+
+    return {
+        # fmt: off
+        # postnet_size = 5, 1st wm block = 8 -> 13
+        r"^conv_pre\.":               r"layers.13.",                                 # noqa, 2nd wm block = 8 -> +8
+        r"^ups\.([0-9]+)\.":          lambda x: f"layers.{int(x.group(1)) + 22}.",   # noqa, ups_size = 4, 3rd wm block = 8 -> +12
+        r"^resblocks\.([0-9]+)\.":    lambda x: f"layers.{int(x.group(1)) + 34}.",   # noqa, resblocks_size = 12, 4th wm block = 8 -> +20
+        r"^conv_post\.":              r"layers.54.",
+        # fmt: on
+    }
+
+
+def wm_key_map() -> Mapping[Any, Any]:
+    """
+    Flatten the watermarker encoder and decoder into the same sequential `layers` (step 1), split the
+    watermarker into 4 blocks and mix them into the layers of the p2v decoder and mel-vocoder:
+        - encoder.model.[0-7] --> layers.<postnet_size + i> (5 --> 12)
+        - encoder.model.[8-15] --> layers.<postnet_size + conv_pre_size + i> (14 --> 21)
+        - decoder.model.[0-7] --> layers.<postnet_size + conv_pre_size + encoder_size + ups_size + i> (26 --> 33)
+        - decoder.model.[8-15] --> layers.<postnet_size + conv_pre_size + encoder_size + ups_size + resblocks_size + i> (46 --> 53)
+    """
+
+    def encoder_layer_index(match_obj: Match[str]) -> str:
+        idx = int(match_obj.group(1))
+        # First half of the encoder is after the PostNet
+        if idx < 8:
+            # postnet_size = 5
+            return f"layers.{idx + 5}."
+
+        # Second half of the encoder goes after the mel-vocoder:conv_pre
+        else:
+            # postnet = 5, conv_pre = 1 --> +6
+            return f"layers.{idx + 6}."
+
+    def decoder_layer_index(match_obj: Match[str]) -> str:
+        idx = int(match_obj.group(1))
+        # First half of the decoder is after the mel-vocoder:ups
+        if idx < 8:
+            # postnet 5, conv_pre 1, encoder 16, ups 4 --> +26
+            return f"layers.{idx + 26}."
+        else:
+            # postnet 5, conv_pre 1, encoder 16, ups 4, resblock 12 -> +38
+            return f"layers.{idx + 38}."
+
+    return {
+        r"^encoder\.model\.([0-9]+)\.": encoder_layer_index,
+        r"^decoder\.model\.([0-9]+)\.": decoder_layer_index,
+    }
+
+
+def combine_chkpts(pretssel_file: str, vocoder_file: str, out_path: str) -> None:
+    """Combine the pretssel and melhifigan into one model"""
+    pretssel_chkpt = load_checkpoint(pretssel_file)
+    pretssel_chkpt = convert_fairseq_checkpoint(pretssel_chkpt, pretssel_key_map())
+
+    vocoder_chkpt = load_checkpoint(vocoder_file)
+    vocoder_chkpt = convert_fairseq_checkpoint(vocoder_chkpt, vocoder_key_map())
+
+    wm_ckpt = load_checkpoint(
+        "/large_experiments/seamless/nllb/watermarking/checkpoints/ckpt_e9d0008c.th",
+    )
+    # wm_ckpt is not a fairseq2 checkpoint so we have to handle it differently
+    wm_ckpt = convert_model_state_dict(wm_ckpt, wm_key_map())
+
+    # Merge the state dicts
+    ckpt = pretssel_chkpt
+    state_dict = ckpt["model"]
+    for vocoder_key in vocoder_chkpt["model"]:
+        state_dict[vocoder_key] = vocoder_chkpt["model"][vocoder_key]
+
+    for wm_key, wm_val in wm_ckpt.items():
+        if wm_key.startswith("layers."):
+            state_dict[wm_key] = wm_val
+
+    # Remove obsolete layers
+    keys_to_delete = [
+        "encoder.embed_positions._float_tensor",
+        "decoder.embed_positions._float_tensor",
+        "enc_emb_proj.weight",
+        "enc_emb_proj.bias",
+    ]
+    keys_to_delete.extend(
+        [
+            key
+            for key in state_dict
+            if key.startswith("decoder.var_adaptor.duration_predictor")
+        ]
+    )
+    for key in keys_to_delete:
+        if key in state_dict:
+            del state_dict[key]
+
+    out_path = "/large_experiments/seamless/workstream/expressivity/oss/checkpoints/pretssel_melhifigan_wm-final.pt"
+    model_mapping_metafile = Path(out_path).with_suffix(".arch")
+    with open(model_mapping_metafile, "w", encoding="utf-8") as o:
+        o.write(vocoder_key_map.__doc__)  # type: ignore
+        o.write("\n")
+        o.write(wm_key_map.__doc__)  # type: ignore
+        o.write("\n")
+    torch.save(ckpt, out_path)
+
+
+if __name__ == "__main__":
+    # fmt: off
+    parser = ArgumentParser(description="Compile watermarking into p2v decoder and vocoder")
+    parser.add_argument(
+        "--pretssel",
+        default="/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16/checkpoint_best.pt",
+        type=str,
+        help="Path to the Pretssel model checkpoint",
+    )
+    parser.add_argument(
+        "--vocoder",
+        # default="/large_experiments/seamless/ust/changhan/checkpoints/fairseq2/pretssel_hifigan.pt",
+        default="/large_experiments/seamless/workstream/expressivity/oss/checkpoints/melhifigan_20231121.pt",
+        type=str,
+        help="Path to the mel-vocoder checkpoint",
+    )
+    parser.add_argument(
+        "--output",
+        default="/large_experiments/seamless/workstream/expressivity/oss/checkpoints/pretssel_melhifigan_wm-final.pt",
+        # default="/large_experiments/seamless/workstream/expressivity/oss/checkpoints/pretssel_melhifigan_wm-20231121.pt",
+        type=str,
+        help="Path to the output watermarked model checkpoint",
+    )
+    # fmt: on
+    args = parser.parse_args()
+    combine_chkpts(args.pretssel, args.vocoder, args.output)

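The three key maps above encode one flat indexing scheme; laying the blocks out in order makes the arithmetic checkable (a standalone sketch, not part of the script):

```python
# Sanity check for the mixed layout described in the header comment:
# lay the blocks out in order and print the index range of each.
blocks = [
    ("postnet", 5),
    ("wm_encoder[0:8]", 8),
    ("conv_pre", 1),
    ("wm_encoder[8:16]", 8),
    ("ups", 4),
    ("wm_decoder[0:8]", 8),
    ("resblocks", 12),
    ("wm_decoder[8:16]", 8),
    ("conv_post", 1),
]
start = 0
for name, size in blocks:
    print(f"{name}: layers.{start}..layers.{start + size - 1}")
    start += size
# conv_pre lands at layers.13, ups at layers.22..25, resblocks at
# layers.34..45 and conv_post at layers.54, matching the key maps above.
```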
+ 44 - 0
scripts/watermarking/seamlesswatermark.yaml

@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+name: seamlesswatermark
+model_type: seanet
+checkpoint: "/large_experiments/seamless/nllb/watermarking/checkpoints/ckpt_e9d0008c.th"
+watermarker_model:
+  channels: 1
+  sample_rate: 16000
+seanet:
+  activation: ELU
+  activation_params:
+    alpha: 1.0
+  causal: false
+  channels: 1
+  compress: 2
+  decoder:
+    final_activation: null
+    final_activation_params: null
+    trim_right_ratio: 1.0
+  detector: {}
+  dilation_base: 2
+  dimension: 128
+  disable_norm_outer_blocks: 0
+  encoder: {}
+  kernel_size: 7
+  last_kernel_size: 7
+  lstm: 2
+  n_filters: 32
+  n_residual_layers: 1
+  norm: weight_norm
+  norm_params: {}
+  pad_mode: constant
+  ratios:
+  - 8
+  - 5
+  - 4
+  - 2
+  residual_kernel_size: 3
+  true_skip: true

+ 236 - 0
scripts/watermarking/watermarking.py

@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# The original implementation for the watermarking
+# This is not open-sourced and only kept here for future reference
+# mypy: ignore-errors
+
+import math
+from argparse import ArgumentParser, ArgumentTypeError
+from pathlib import Path
+from typing import Any, Dict, Union, cast
+
+import audiocraft
+import omegaconf
+import torch
+import torch.nn as nn
+import torchaudio
+from audiocraft.modules.seanet import SEANetEncoder
+from audiocraft.utils.utils import dict_from_config
+from fairseq2.typing import DataType, Device
+
+
+class SEANetEncoderKeepDimension(SEANetEncoder):
+    """
+    Same architecture as the SEANet encoder, with an extra transposed
+    convolution that projects the output back to the input frame rate,
+    so the output has the same number of frames as the input.
+
+    Args:
+        output_hidden_dim (int): Number of channels of the projected output.
+    """
+
+    def __init__(self, output_hidden_dim: int = 8, *args, **kwargs):  # type: ignore
+        super().__init__(*args, **kwargs)
+        self.output_hidden_dim = output_hidden_dim
+        # Adding a reverse convolution layer
+        self.reverse_convolution = nn.ConvTranspose1d(
+            in_channels=self.dimension,
+            out_channels=self.output_hidden_dim,
+            kernel_size=math.prod(self.ratios),
+            stride=math.prod(self.ratios),
+            padding=0,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        orig_nframes = x.shape[-1]
+        x = self.model(x)
+        x = self.reverse_convolution(x)
+        # make sure dim didn't change
+        x = x[:, :, :orig_nframes]
+        return x
+
+
+class Watermarker(nn.Module):
+    """
+    Initialize the Watermarker model.
+
+    Args:
+        encoder (nn.Module): Watermark Encoder.
+        decoder (nn.Module): Watermark Decoder.
+        detector (nn.Module): Watermark Detector.
+        sample_rate (int): Audio sample rate.
+        channels (int): Number of audio channels.
+    """
+
+    sample_rate: int = 0
+    channels: int = 0
+    encoder: SEANetEncoder
+    decoder: SEANetEncoder
+    detector: SEANetEncoderKeepDimension
+
+    def __init__(
+        self,
+        encoder: SEANetEncoder,
+        decoder: SEANetEncoder,
+        detector: SEANetEncoderKeepDimension,
+        sample_rate: int,
+        channels: int,
+    ):
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.detector = detector
+        self.sample_rate = sample_rate
+        self.channels = channels
+
+    def get_watermark(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Get the watermark from a batch of audio input.
+
+        Args:
+            x (torch.Tensor): Input audio tensor with dimensions [batch size, channels = 1, frames].
+
+        Returns:
+            torch.Tensor: Output watermark with the same dimensionality as the input.
+        """
+        hidden = self.encoder(x)
+        # assert dim in = dim out
+        watermark = self.decoder(hidden)[:, :, : x.size(-1)]
+        return watermark
+
+    def detect_watermark(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Detect the watermark in a batch of audio input.
+
+        Args:
+            x (torch.Tensor): Input audio tensor with dimensions
+            [batch size, channels = 1, frames].
+
+        Returns:
+            torch.Tensor: Predictions of the classifier for watermark
+            with dimensions [bsz, classes = 2, frames].
+            For each frame, the detector outputs probabilities of
+            non-watermarked class (class id 0) and
+            the probability of "watermarked" class (class id 1).
+            To do inference, you can use output[:, 1, :]
+            to get probabilities of input audio being watermarked.
+        """
+        return self.detector(x)
+
+
+def model_from_checkpoint(
+    checkpoint_path: Union[Path, str] = Path(__file__).parent
+    / "seamlesswatermark.yaml",
+    device: Union[torch.device, str] = "cpu",
+    dtype: DataType = torch.float32,
+) -> Watermarker:
+    """Instantiate a Watermarker model from a given checkpoint path.
+
+    Example usage:
+    >>> from watermarking.watermarking import *
+    >>> cfg = "seamlesswatermark.yaml"
+    >>> url = "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
+    >>> urllib.request.urlretrieve(url, "random.wav")
+    >>> wav, _ = torchaudio.load("random.wav")
+    >>> wav = wav.unsqueeze(0)  # add bsz dimension
+
+    # code starts here
+    >>> model = model_from_checkpoint(cfg, device = wav.device)
+
+    >>> watermark = model.get_watermark(wav)
+
+    >>> watermarked_audio = wav + watermark
+    >>> detection = model.detect_watermark(watermarked_audio)
+    >>> print(detection[:,1,:])  # print prob of watermarked class # should be > 0.5
+
+    >>> detection = model.detect_watermark(wav)
+    >>> print(detection[:,1,:])  # print prob of watermarked class  # should be < 0.5
+
+    Args:
+        checkpoint_path (Path or str): Path to the yaml config that points to the checkpoint file.
+        device (torch.device or str, optional): Device on which
+        the model is loaded (default is "cpu").
+
+    Returns:
+        Watermarker: An instance of the Watermarker model loaded from the checkpoint.
+    """
+    cfg = omegaconf.OmegaConf.load(checkpoint_path)
+    state: Dict[str, Any] = torch.load(cfg["checkpoint"])
+    watermarking_model = get_watermarking_model(cfg)
+    watermarking_model.load_state_dict(state)
+    watermarking_model = watermarking_model.to(device, dtype=dtype)
+    watermarking_model.eval()
+    return watermarking_model
+
+
+def get_watermarking_model(cfg: omegaconf.DictConfig) -> Watermarker:
+    kwargs = dict_from_config(getattr(cfg, "watermarker_model"))
+    encoder, decoder = get_encodec_autoencoder(cfg)
+    detector = get_detector(cfg)
+    return Watermarker(encoder, decoder, detector, **kwargs)
+
+
+def get_encodec_autoencoder(cfg: omegaconf.DictConfig):
+    kwargs = dict_from_config(getattr(cfg, "seanet"))
+    if hasattr(cfg.seanet, "detector"):
+        kwargs.pop("detector")
+    encoder_override_kwargs = kwargs.pop("encoder")
+    decoder_override_kwargs = kwargs.pop("decoder")
+    encoder_kwargs = {**kwargs, **encoder_override_kwargs}
+    decoder_kwargs = {**kwargs, **decoder_override_kwargs}
+    encoder = audiocraft.modules.SEANetEncoder(**encoder_kwargs)
+    decoder = audiocraft.modules.SEANetDecoder(**decoder_kwargs)
+    return encoder, decoder
+
+
+def get_detector(cfg: omegaconf.DictConfig):
+    kwargs = dict_from_config(getattr(cfg, "seanet"))
+    encoder_override_kwargs = kwargs.pop("detector")
+    kwargs.pop("decoder")
+    kwargs.pop("encoder")
+    encoder_kwargs = {**kwargs, **encoder_override_kwargs}
+    output_hidden_dim = 8
+    encoder = SEANetEncoderKeepDimension(output_hidden_dim, **encoder_kwargs)
+
+    last_layer = torch.nn.Conv1d(output_hidden_dim, 2, 1)
+    softmax = torch.nn.Softmax(dim=1)
+    detector = torch.nn.Sequential(encoder, last_layer, softmax)
+    return detector
+
+
+def parse_device_arg(value: str) -> Device:
+    try:
+        return Device(value)
+    except RuntimeError:
+        raise ArgumentTypeError(f"'{value}' is not a valid device name.")
+
+
+if __name__ == "__main__":
+    """
+    Example usage:
+    python watermarking.py --device cuda:0 detect [file.wav]
+    """
+    parser = ArgumentParser(description="Handle the watermarking for audios")
+    parser.add_argument(
+        "--device",
+        default="cpu",
+        type=parse_device_arg,
+        help="device on which to run tests (default: %(default)s)",
+    )
+    sub_parser = parser.add_subparsers(title="actions", dest="sub_cmd")
+    detect_parser = sub_parser.add_parser("detect")
+    wm_parser = sub_parser.add_parser("wm")
+    parser.add_argument("file", type=str, help="Path to the .wav file")
+
+    args = parser.parse_args()
+
+    if args.sub_cmd == "detect":
+        model = model_from_checkpoint(device=args.device)
+        wav, _ = torchaudio.load(args.file)
+        wav = wav.unsqueeze(0)
+        wav = wav.to(args.device)
+        detection = model.detect_watermark(wav)
+        print(detection[:, 1, :])

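Building on the `detect_watermark` docstring, a clip-level decision over the frame-wise probabilities could look like this (a sketch with a hypothetical input file and a 0.5 threshold, not part of the script):

```python
import torchaudio

# Load the watermarker from the default yaml next to the script.
model = model_from_checkpoint(device="cpu")

wav, _ = torchaudio.load("random.wav")  # hypothetical input file
wav = wav.unsqueeze(0)                  # [bsz=1, channels=1, frames]

detection = model.detect_watermark(wav)  # [bsz, classes=2, frames]
frame_probs = detection[:, 1, :]         # per-frame "watermarked" probability
is_watermarked = frame_probs.mean(dim=-1) > 0.5  # simple clip-level vote
print(is_watermarked)
```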
+ 182 - 0
src/seamless_communication/cards/vocoder_pretssel.yaml

@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: vocoder_pretssel
+model_type: vocoder_pretssel
+model_arch: 24khz
+checkpoint: "file:///large_experiments/seamless/workstream/expressivity/oss/checkpoints/pretssel_melhifigan_wm-final.pt"
+sample_rate: 24000
+model_config:
+  langs:
+    - cmn
+    - deu
+    - eng
+    - fra
+    - ita
+    - spa
+  gcmvn_stats:
+    mean:
+      - 9.023406257490224
+      - 9.406622923058864
+      - 10.554165334059368
+      - 11.475190058682356
+      - 12.179117104099705
+      - 12.603782921407062
+      - 12.769632747861747
+      - 12.714276772934083
+      - 12.747612172560233
+      - 12.750373688097946
+      - 12.948050207790237
+      - 13.121829398704277
+      - 13.40130828476734
+      - 13.58028050886195
+      - 13.601835409305883
+      - 13.608734047373218
+      - 13.538274892335826
+      - 13.391518457210937
+      - 13.382843811359622
+      - 13.0524299456858
+      - 12.785193828396269
+      - 12.876608812372632
+      - 12.59571918874957
+      - 12.674484745567813
+      - 12.57325195345546
+      - 12.651938120109422
+      - 12.556821722150424
+      - 12.639338348530158
+      - 12.610449431411217
+      - 12.639992872912376
+      - 12.697503827987052
+      - 12.754788270377214
+      - 12.837605043617405
+      - 12.964379088501497
+      - 13.11997048142582
+      - 13.267395589173432
+      - 13.384668687260483
+      - 13.495000208959356
+      - 13.606835320307384
+      - 13.578073476073252
+      - 13.689796531497368
+      - 13.643079802391588
+      - 13.7340755472615
+      - 13.735199777666043
+      - 13.79347692248429
+      - 13.875183654243305
+      - 13.967272256671393
+      - 14.058507936754117
+      - 14.114704594203507
+      - 14.156211337193277
+      - 14.14747081594401
+      - 14.173917097974343
+      - 14.22330474758318
+      - 14.251272943225572
+      - 14.230904505178053
+      - 14.226937644205396
+      - 14.222223350670225
+      - 14.211638354996317
+      - 14.208930098405544
+      - 14.19476983404041
+      - 14.2195925729048
+      - 14.16490878238837
+      - 14.115436751205117
+      - 14.039442767347872
+      - 13.976934063901625
+      - 13.917068116556464
+      - 13.856293662219073
+      - 13.773769842100085
+      - 13.706245521082796
+      - 13.685052933361192
+      - 13.68570131643094
+      - 13.714811890011152
+      - 13.751451253935347
+      - 13.772212258132148
+      - 13.76013448427468
+      - 13.702368406557508
+      - 13.600406368803617
+      - 13.369574889658164
+      - 12.998399608309988
+      - 12.443732902848723
+    std:
+      - 3.729248515707457
+      - 4.001623098079929
+      - 4.570009061358065
+      - 4.811572361201577
+      - 5.010239923828185
+      - 5.152145212706857
+      - 5.223885876119451
+      - 5.224443623432338
+      - 5.161790275239061
+      - 5.098988232815804
+      - 5.090890035509122
+      - 5.130345212529546
+      - 5.165849688173366
+      - 5.164761699263693
+      - 5.131177988219367
+      - 5.085522051815558
+      - 5.035829108165894
+      - 4.987478975310455
+      - 4.932652442855969
+      - 4.8650037198748075
+      - 4.799238163232527
+      - 4.727086345775988
+      - 4.646858066575789
+      - 4.5733249959652715
+      - 4.51685060334288
+      - 4.467449073425149
+      - 4.4296881304192075
+      - 4.4028775449713775
+      - 4.397905653025904
+      - 4.3862594566308015
+      - 4.366485847923521
+      - 4.344483498393771
+      - 4.324692736391383
+      - 4.310481738978154
+      - 4.3053492473916
+      - 4.3035205126659655
+      - 4.2987898577000605
+      - 4.287403454800855
+      - 4.27087296372773
+      - 4.25387490294079
+      - 4.233513102251301
+      - 4.212047255068752
+      - 4.1810370158214445
+      - 4.186014591107853
+      - 4.194806047136222
+      - 4.2183377208747075
+      - 4.249293562464735
+      - 4.268847210561774
+      - 4.270455756367186
+      - 4.25811368227528
+      - 4.245975115347766
+      - 4.23058010369271
+      - 4.203075111087773
+      - 4.20123812057283
+      - 4.187143614375688
+      - 4.172633823274146
+      - 4.162541203161947
+      - 4.156022884601996
+      - 4.1618428838805706
+      - 4.157259439238067
+      - 4.139859013016601
+      - 4.150685014911159
+      - 4.152025499126372
+      - 4.165010788120131
+      - 4.15179422331336
+      - 4.137041631098819
+      - 4.10861757770052
+      - 4.119916019361405
+      - 4.131749366642117
+      - 4.119438578634397
+      - 4.100095269698108
+      - 4.073900009963118
+      - 4.0580796715728855
+      - 4.050916705279105
+      - 4.037976834115189
+      - 4.023757063156459
+      - 3.9987849927993353
+      - 3.989251079820668
+      - 3.9464430977885256
+      - 3.8673932921278995

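These per-bin statistics are consumed downstream via `load_gcmvn_stats`, which reads them back from the card so the fbank features can be normalized before the prosody encoder; a minimal sketch mirroring the usage in `pretssel_inference.py` below:

```python
import torch

from seamless_communication.models.unity import load_gcmvn_stats

# Read the mean/std lists from the vocoder_pretssel card above.
_gcmvn_mean, _gcmvn_std = load_gcmvn_stats("vocoder_pretssel")
gcmvn_mean = torch.tensor(_gcmvn_mean)  # 80 values, matching num_mel_bins=80
gcmvn_std = torch.tensor(_gcmvn_std)

fbank = torch.randn(100, 80)  # dummy [frames, mel-bins] features
gcmvn_fbank = fbank.subtract(gcmvn_mean).divide(gcmvn_std)
```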
+ 182 - 0
src/seamless_communication/cards/vocoder_pretssel_16khz.yaml

@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: vocoder_pretssel_16khz
+model_type: vocoder_pretssel
+model_arch: 16khz
+checkpoint: "file:///large_experiments/seamless/workstream/expressivity/oss/checkpoints/pretssel_melhifigan_wm-16khz.pt"
+sample_rate: 16000
+model_config:
+  langs:
+    - cmn
+    - deu
+    - eng
+    - fra
+    - ita
+    - spa
+  gcmvn_stats:
+    mean:
+      - 9.023406257490224
+      - 9.406622923058864
+      - 10.554165334059368
+      - 11.475190058682356
+      - 12.179117104099705
+      - 12.603782921407062
+      - 12.769632747861747
+      - 12.714276772934083
+      - 12.747612172560233
+      - 12.750373688097946
+      - 12.948050207790237
+      - 13.121829398704277
+      - 13.40130828476734
+      - 13.58028050886195
+      - 13.601835409305883
+      - 13.608734047373218
+      - 13.538274892335826
+      - 13.391518457210937
+      - 13.382843811359622
+      - 13.0524299456858
+      - 12.785193828396269
+      - 12.876608812372632
+      - 12.59571918874957
+      - 12.674484745567813
+      - 12.57325195345546
+      - 12.651938120109422
+      - 12.556821722150424
+      - 12.639338348530158
+      - 12.610449431411217
+      - 12.639992872912376
+      - 12.697503827987052
+      - 12.754788270377214
+      - 12.837605043617405
+      - 12.964379088501497
+      - 13.11997048142582
+      - 13.267395589173432
+      - 13.384668687260483
+      - 13.495000208959356
+      - 13.606835320307384
+      - 13.578073476073252
+      - 13.689796531497368
+      - 13.643079802391588
+      - 13.7340755472615
+      - 13.735199777666043
+      - 13.79347692248429
+      - 13.875183654243305
+      - 13.967272256671393
+      - 14.058507936754117
+      - 14.114704594203507
+      - 14.156211337193277
+      - 14.14747081594401
+      - 14.173917097974343
+      - 14.22330474758318
+      - 14.251272943225572
+      - 14.230904505178053
+      - 14.226937644205396
+      - 14.222223350670225
+      - 14.211638354996317
+      - 14.208930098405544
+      - 14.19476983404041
+      - 14.2195925729048
+      - 14.16490878238837
+      - 14.115436751205117
+      - 14.039442767347872
+      - 13.976934063901625
+      - 13.917068116556464
+      - 13.856293662219073
+      - 13.773769842100085
+      - 13.706245521082796
+      - 13.685052933361192
+      - 13.68570131643094
+      - 13.714811890011152
+      - 13.751451253935347
+      - 13.772212258132148
+      - 13.76013448427468
+      - 13.702368406557508
+      - 13.600406368803617
+      - 13.369574889658164
+      - 12.998399608309988
+      - 12.443732902848723
+    std:
+      - 3.729248515707457
+      - 4.001623098079929
+      - 4.570009061358065
+      - 4.811572361201577
+      - 5.010239923828185
+      - 5.152145212706857
+      - 5.223885876119451
+      - 5.224443623432338
+      - 5.161790275239061
+      - 5.098988232815804
+      - 5.090890035509122
+      - 5.130345212529546
+      - 5.165849688173366
+      - 5.164761699263693
+      - 5.131177988219367
+      - 5.085522051815558
+      - 5.035829108165894
+      - 4.987478975310455
+      - 4.932652442855969
+      - 4.8650037198748075
+      - 4.799238163232527
+      - 4.727086345775988
+      - 4.646858066575789
+      - 4.5733249959652715
+      - 4.51685060334288
+      - 4.467449073425149
+      - 4.4296881304192075
+      - 4.4028775449713775
+      - 4.397905653025904
+      - 4.3862594566308015
+      - 4.366485847923521
+      - 4.344483498393771
+      - 4.324692736391383
+      - 4.310481738978154
+      - 4.3053492473916
+      - 4.3035205126659655
+      - 4.2987898577000605
+      - 4.287403454800855
+      - 4.27087296372773
+      - 4.25387490294079
+      - 4.233513102251301
+      - 4.212047255068752
+      - 4.1810370158214445
+      - 4.186014591107853
+      - 4.194806047136222
+      - 4.2183377208747075
+      - 4.249293562464735
+      - 4.268847210561774
+      - 4.270455756367186
+      - 4.25811368227528
+      - 4.245975115347766
+      - 4.23058010369271
+      - 4.203075111087773
+      - 4.20123812057283
+      - 4.187143614375688
+      - 4.172633823274146
+      - 4.162541203161947
+      - 4.156022884601996
+      - 4.1618428838805706
+      - 4.157259439238067
+      - 4.139859013016601
+      - 4.150685014911159
+      - 4.152025499126372
+      - 4.165010788120131
+      - 4.15179422331336
+      - 4.137041631098819
+      - 4.10861757770052
+      - 4.119916019361405
+      - 4.131749366642117
+      - 4.119438578634397
+      - 4.100095269698108
+      - 4.073900009963118
+      - 4.0580796715728855
+      - 4.050916705279105
+      - 4.037976834115189
+      - 4.023757063156459
+      - 3.9987849927993353
+      - 3.989251079820668
+      - 3.9464430977885256
+      - 3.8673932921278995

+ 1 - 1
src/seamless_communication/cli/expressivity/evaluate/evaluate.py

@@ -385,7 +385,7 @@ def main() -> None:
         text_generation_opts=text_generation_opts,
         unit_generation_opts=unit_generation_opts,
         unit_generation_ngram_filtering=args.unit_generation_ngram_filtering,
-        output_path=Path(args.output_path),
+        output_path=args.output_path,
         gcmvn_mean=torch.tensor(gcmvn_mean, device=device, dtype=dtype),
         gcmvn_std=torch.tensor(gcmvn_std, device=device, dtype=dtype),
         pretssel_model=args.pretssel_model,

+ 373 - 0
src/seamless_communication/cli/expressivity/evaluate/pretssel_inference.py

@@ -0,0 +1,373 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import contextlib
+import logging
+from argparse import Namespace
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch.nn import Module
+import torchaudio
+from fairseq2.assets import asset_store
+from fairseq2.assets.card import AssetCard
+from fairseq2.data import Collater, DataPipeline, FileMapper, SequenceData
+from fairseq2.data.audio import (
+    AudioDecoder,
+    WaveformToFbankConverter,
+    WaveformToFbankOutput,
+)
+from fairseq2.data.text import StrSplitter, TextTokenizer, read_text
+from fairseq2.generation import SequenceGeneratorOptions
+from fairseq2.typing import DataType, Device
+from fairseq2.nn.padding import get_seqs_and_padding_mask
+from sacrebleu.metrics import BLEU  # type: ignore[attr-defined]
+from torch import Tensor
+from tqdm import tqdm
+
+from seamless_communication.models.unity import UnitTokenizer
+from seamless_communication.cli.m4t.evaluate.evaluate import (
+    adjust_output_for_corrupted_inputs,
+    count_lines,
+)
+from seamless_communication.cli.m4t.predict import (
+    add_inference_arguments,
+    set_generation_opts,
+)
+from seamless_communication.inference import BatchedSpeechOutput, Translator
+from seamless_communication.models.unity import (
+    load_gcmvn_stats,
+    load_unity_text_tokenizer,
+    load_unity_unit_tokenizer,
+)
+from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PretsselGenerator(Module):
+    def __init__(
+        self,
+        pretssel_name_or_card: Union[str, AssetCard],
+        unit_tokenizer: UnitTokenizer,
+        device: Device,
+        dtype: DataType = torch.float16,
+    ):
+        super().__init__()
+        # Load the model.
+        if device == torch.device("cpu"):
+            dtype = torch.float32
+
+        self.device = device
+        self.dtype = dtype
+
+        self.pretssel_model = load_pretssel_vocoder_model(
+            pretssel_name_or_card,
+            device=device,
+            dtype=dtype,
+        )
+        self.pretssel_model.eval()
+
+        vocoder_model_card = asset_store.retrieve_card(pretssel_name_or_card)
+        self.output_sample_rate = vocoder_model_card.field("sample_rate").as_(int)
+
+        self.unit_tokenizer = unit_tokenizer
+        self.unit_collate = Collater(pad_value=unit_tokenizer.vocab_info.pad_idx)
+        self.duration_collate = Collater(pad_value=0)
+
+    @torch.inference_mode()
+    def predict(
+        self,
+        units: List[List[int]],
+        tgt_lang: str,
+        prosody_encoder_input: SequenceData,
+    ) -> BatchedSpeechOutput:
+        audio_wavs = []
+        unit_eos_token = torch.tensor(
+            [self.unit_tokenizer.vocab_info.eos_idx],
+            device=self.device,
+        )
+
+        prosody_input_seqs = prosody_encoder_input["seqs"]
+        prosody_input_lens = prosody_encoder_input["seq_lens"]
+
+        for i, u in enumerate(units):
+            unit = torch.tensor(u).to(unit_eos_token)
+
+            # adjust the control symbols for the embedding
+            unit += 4
+            unit = torch.cat([unit, unit_eos_token], dim=0)
+
+            unit, duration = torch.unique_consecutive(unit, return_counts=True)
+
+            # adjust for the last eos token
+            duration[-1] = 0
+
+            duration *= 2
+
+            prosody_input_seq = prosody_input_seqs[i][:prosody_input_lens[i]]
+
+            audio_wav = self.pretssel_model(
+                unit,
+                tgt_lang,
+                prosody_input_seq,
+                durations=duration.unsqueeze(0),
+            )
+
+            audio_wavs.append(audio_wav)
+
+        return BatchedSpeechOutput(
+            units=units,
+            audio_wavs=audio_wavs,
+            sample_rate=self.output_sample_rate,
+        )
+
+
+def build_data_pipeline(
+    args: Namespace,
+    text_tokenizer: TextTokenizer,
+    device: Device,
+    dtype: DataType,
+    gcmvn_mean: Tensor,
+    gcmvn_std: Tensor,
+) -> DataPipeline:
+    with open(args.data_file, "r") as f:
+        header = f.readline().strip("\n").split("\t")
+
+    n_parallel = 4
+
+    split_tsv = StrSplitter(names=header)
+
+    pipeline_builder = read_text(args.data_file, rtrim=True).skip(1).map(split_tsv)
+
+    assert args.audio_root_dir is not None
+
+    map_file = FileMapper(root_dir=args.audio_root_dir, cached_fd_count=10)
+
+    pipeline_builder.map(map_file, selector="audio", num_parallel_calls=n_parallel)
+
+    decode_audio = AudioDecoder(dtype=torch.float32, device=device)
+
+    convert_to_fbank = WaveformToFbankConverter(
+        num_mel_bins=80,
+        waveform_scale=2**15,
+        channel_last=True,
+        standardize=False,
+        device=device,
+        dtype=dtype,
+    )
+
+    def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput:
+        fbank = data["fbank"]
+        std, mean = torch.std_mean(fbank, dim=0)
+        data["fbank"] = fbank.subtract(mean).divide(std)
+        data["gcmvn_fbank"] = fbank.subtract(gcmvn_mean).divide(gcmvn_std)
+        return data
+
+    pipeline_builder.map(
+        [decode_audio, convert_to_fbank, normalize_fbank],
+        selector="audio.data",
+        num_parallel_calls=n_parallel,
+    )
+
+    pipeline_builder.bucket(bucket_size=args.batch_size)
+
+    collate = Collater(pad_value=0, pad_to_multiple=1)
+
+    pipeline_builder.map(collate, num_parallel_calls=n_parallel)
+
+    pipeline_builder.prefetch(4)
+
+    return pipeline_builder.and_return()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Running PretsselModel inference")
+    parser.add_argument("data_file", type=Path, help="Data file (.tsv) to be evaluated.")
+
+    parser = add_inference_arguments(parser)
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        help="Inference batch size.",
+        default=4,
+    )
+    parser.add_argument(
+        "--audio_root_dir",
+        type=Path,
+        help="Root directory for the audio filenames in the data file.",
+        default="",
+    )
+    parser.add_argument(
+        "--ref_field",
+        type=str,
+        help="Reference target text field to compute the BLEU score against.",
+        default="tgt_text",
+    )
+    parser.add_argument(
+        "--duration_factor",
+        type=float,
+        help="The duration factor for NAR T2U model. Expressivity model uses 1.1",
+        default=1.1,
+    )
+    args = parser.parse_args()
+
+    if torch.cuda.is_available():
+        device = torch.device("cuda:0")
+        dtype = torch.float16
+    else:
+        device = torch.device("cpu")
+        dtype = torch.float32
+
+    text_tokenizer = load_unity_text_tokenizer(args.model_name)
+    unit_tokenizer = load_unity_unit_tokenizer(args.model_name)
+
+    _gcmvn_mean, _gcmvn_std = load_gcmvn_stats(args.vocoder_name)
+    gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype)
+    gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype)
+
+    pipeline = build_data_pipeline(
+        args, text_tokenizer, device, dtype, gcmvn_mean, gcmvn_std
+    )
+
+    translator = Translator(
+        args.model_name,
+        vocoder_name_or_card=None,
+        device=device,
+        text_tokenizer=text_tokenizer,
+        dtype=dtype,
+    )
+
+    text_generation_opts, unit_generation_opts = set_generation_opts(args)
+
+    logger.info(f"{text_generation_opts=}")
+    logger.info(f"{unit_generation_opts=}")
+    logger.info(
+        f"unit_generation_ngram_filtering={args.unit_generation_ngram_filtering}"
+    )
+
+    pretssel_generator = PretsselGenerator(
+        args.vocoder_name,
+        unit_tokenizer=unit_tokenizer,
+        device=device,
+        dtype=dtype,
+    )
+
+    total_steps = count_lines(args.data_file) - 1
+    progress_bar = tqdm(total=total_steps)
+
+    output_path = args.output_path / args.data_file.stem
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    waveforms_dir = output_path / "waveform"
+    waveforms_dir.mkdir(parents=True, exist_ok=True)
+
+    hyps = []
+    refs = []
+
+    with contextlib.ExitStack() as stack:
+        hyp_file = stack.enter_context(
+            open(output_path / f"text_output-{args.data_file.stem}.txt", "w")
+        )
+        unit_file = stack.enter_context(
+            open(output_path / f"unit_output-{args.data_file.stem}.txt", "w")
+        )
+
+        sample_id = 0
+        for example in pipeline:
+            valid_sequences: Optional[Tensor] = None
+            src = example["audio"]["data"]["fbank"]
+            # Skip corrupted audio tensors.
+            valid_sequences = ~torch.any(
+                torch.any(torch.isnan(src["seqs"]), dim=1), dim=1
+            )
+            if not valid_sequences.all():
+                logger.warning(
+                    f"Sample IDs {sample_id} to {sample_id + args.batch_size} has some corrupted input."
+                )
+                src["seqs"] = src["seqs"][valid_sequences]
+                src["seq_lens"] = src["seq_lens"][valid_sequences]
+
+            # Skip performing inference when the input is entirely corrupted.
+            if src["seqs"].numel() > 0:
+                prosody_encoder_input = example["audio"]["data"]["gcmvn_fbank"]
+                text_output, unit_output = translator.predict(
+                    src,
+                    args.task,
+                    args.tgt_lang,
+                    src_lang=args.src_lang,
+                    text_generation_opts=text_generation_opts,
+                    unit_generation_opts=unit_generation_opts,
+                    unit_generation_ngram_filtering=args.unit_generation_ngram_filtering,
+                    duration_factor=args.duration_factor,
+                    prosody_encoder_input=prosody_encoder_input,
+                )
+
+                assert unit_output is not None
+                speech_output = pretssel_generator.predict(
+                    unit_output.units,
+                    tgt_lang=args.tgt_lang,
+                    prosody_encoder_input=prosody_encoder_input,
+                )
+
+            else:
+                text_output = []
+                speech_output = BatchedSpeechOutput(units=[], audio_wavs=[])
+
+            if valid_sequences is not None and not valid_sequences.all():
+                text_output, speech_output = adjust_output_for_corrupted_inputs(  # type: ignore[assignment]
+                    valid_sequences,
+                    text_output,
+                    speech_output,
+                )
+
+            hyps += [str(s) for s in text_output]
+            refs += [str(s) for s in example[args.ref_field]]
+
+            for i in range(len(text_output)):
+                t = text_output[i]
+                idx = str(example["id"][i])
+                hyp_file.write(f"{t}\n")
+
+                u = speech_output.units[i]
+                str_units = [str(i) for i in u]
+                unit_file.write(" ".join(str_units) + "\n")
+                torchaudio.save(
+                    waveforms_dir / f"{idx}_pred.wav",
+                    speech_output.audio_wavs[i][0].to(torch.float32).cpu(),
+                    sample_rate=speech_output.sample_rate,
+                )
+
+                sample_id += 1
+                progress_bar.update(1)
+
+    progress_bar.close()
+    logger.info(f"Processed {len(hyps)} hyps, {len(refs)} refs")
+
+    assert len(hyps) == len(refs)
+    if len(hyps) > 0:
+        if args.tgt_lang in ("cmn", "jpn", "lao", "mya", "tha"):
+            tokenizer = "char"
+        else:
+            tokenizer = "13a"
+
+        bleu = BLEU(tokenize=tokenizer)
+        score = bleu.corpus_score(hyps, [refs])
+        bleu_filename = output_path / f"{args.data_file.stem}_text_output_bleu.json"
+        with open(bleu_filename, "w") as f:
+            f.write(score.format(signature=str(bleu.get_signature()), is_json=True))
+        logger.info(score.format(signature=bleu.get_signature()))
+
+
+if __name__ == "__main__":
+    main()

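The unit preprocessing in `PretsselGenerator.predict` above is easy to trace on a toy sequence (a sketch with dummy unit ids; eos_idx=2 matches the `VocabularyInfo` in the builder):

```python
import torch

units = [0, 0, 0, 17, 17, 42]  # dummy unit ids
eos_idx = 2                    # unit_tokenizer.vocab_info.eos_idx

unit = torch.tensor(units) + 4                     # shift past the control symbols
unit = torch.cat([unit, torch.tensor([eos_idx])])  # append EOS
unit, duration = torch.unique_consecutive(unit, return_counts=True)
duration[-1] = 0                                   # zero out the EOS duration
duration *= 2                                      # double durations, as in predict()
print(unit)      # tensor([ 4, 21, 46,  2])
print(duration)  # tensor([6, 4, 2, 0])
```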
+ 1 - 1
src/seamless_communication/cli/m4t/evaluate/evaluate.py

@@ -424,7 +424,7 @@ def main(optional_args: Optional[Dict[str, Any]] = None) -> None:
         text_generation_opts=text_generation_opts,
         unit_generation_opts=unit_generation_opts,
         unit_generation_ngram_filtering=args.unit_generation_ngram_filtering,
-        output_path=Path(args.output_path),
+        output_path=args.output_path,
     )
     # fmt: on
     logger.info(f"Running inference on {device=} with {dtype=}, {ctx.batch_size=}.")

+ 3 - 2
src/seamless_communication/cli/m4t/predict/predict.py

@@ -6,6 +6,7 @@
 import argparse
 import logging
 from argparse import Namespace
+from pathlib import Path
 from typing import Tuple
 
 import torch
@@ -35,7 +36,7 @@ def add_inference_arguments(parser: argparse.ArgumentParser) -> argparse.Argumen
     )
     parser.add_argument(
         "--output_path",
-        type=str,
+        type=Path,
         help="Path to save the generated audio.",
         default=None,
     )
@@ -167,7 +168,7 @@ def set_generation_opts(
     return text_generation_opts, unit_generation_opts
 
 
-def main():
+def main() -> None:
     parser = argparse.ArgumentParser(
         description="M4T inference on supported tasks using Translator."
     )

+ 5 - 0
src/seamless_communication/models/generator/__init__.py

@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

+ 506 - 0
src/seamless_communication/models/generator/builder.py

@@ -0,0 +1,506 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, Optional, Tuple
+
+from fairseq2.data import VocabularyInfo
+from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.nn.embedding import StandardEmbedding, init_scaled_embedding
+from fairseq2.nn.position_encoder import SinusoidalPositionEncoder
+from fairseq2.nn.projection import Linear
+from fairseq2.nn.transformer import (
+    MultiheadAttention,
+    StandardMultiheadAttention,
+    TransformerNormOrder,
+    create_default_sdpa,
+)
+from fairseq2.typing import DataType, Device
+from torch.nn import Conv1d
+
+from seamless_communication.models.generator.ecapa_tdnn_builder import (
+    EcapaTDNNBuilder,
+    EcapaTDNNConfig,
+    ecapa_tdnn_archs,
+)
+from seamless_communication.models.generator.vocoder import (
+    PretsselDecoderFrontend,
+    PretsselEncoderFrontend,
+    PretsselVocoder,
+)
+from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer
+from seamless_communication.models.unity.fft_decoder_layer import (
+    Conv1dBlock,
+    FeedForwardTransformerLayer,
+)
+from seamless_communication.models.unity.length_regulator import (
+    VarianceAdaptor,
+    VariancePredictor,
+)
+from seamless_communication.models.unity.t2u_builder import VariancePredictorConfig
+
+
+@dataclass
+class PretsselEncoderFrontendConfig:
+    prosody_encoder_config: EcapaTDNNConfig
+    dropout: float
+    lang_embed_dim: Optional[int] = None
+
+
+@dataclass
+class FFTLayerConfig:
+    attention_heads: int
+    hidden_dim: int
+    kernel_size: int
+    dropout: float
+    conv1d_dropout: float
+    film_cond_dim: int
+    use_film: bool = False
+
+
+@dataclass
+class PretsselDecoderFrontendConfig:
+    upsampling_type: Literal["gaussian", "hard"]
+    variance_predictor_config: VariancePredictorConfig
+    add_variance_parallel: bool
+
+
+@dataclass
+class VocoderConfig:
+    """Holds the configuration of a Vocoder model."""
+
+    encoder_frontend_config: PretsselEncoderFrontendConfig
+    fft_layer_config: FFTLayerConfig
+    decoder_frontend_config: PretsselDecoderFrontendConfig
+    pn_conv_dim: int
+    pn_layers: int
+    pn_conv_kernel_size: int
+    pn_dropout: float
+    vocab_info: VocabularyInfo
+    model_dim: int
+    max_seq_len: int
+    encoder_layers: int
+    decoder_layers: int
+    mel_dim: int
+    langs: List  # type: ignore[type-arg]
+    upsample_rates: List[int]
+    upsample_kernel_sizes: List[int]
+    upsample_initial_channel: int
+    resblock_kernel_sizes: List[int]
+    resblock_dilation_sizes: List[List[int]]
+    channels: int
+    dimension: int
+    n_filters: int
+    ratios: List[int]
+    norm: Literal["none", "weight_norm", "spectral_norm", "time_group_norm"]
+    norm_params: Dict[str, Any]
+    kernel_size: int
+    last_kernel_size: int
+    residual_kernel_size: int
+    causal: bool
+    pad_mode: str
+    true_skip: bool
+    compress: int
+    lstm: int
+    disable_norm_outer_blocks: int
+    trim_right_ratio: float
+    gcmvn_stats: Dict[str, List]  # type: ignore[type-arg]
+
+
+vocoder_archs = ArchitectureRegistry[VocoderConfig]("vocoder_pretssel")
+
+
+vocoder_arch = vocoder_archs.decorator
+
+
+def pretssel_config() -> (
+    Tuple[PretsselEncoderFrontendConfig, FFTLayerConfig, PretsselDecoderFrontendConfig]
+):
+    prosody_encoder_config = ecapa_tdnn_archs.get_config("base")
+
+    encoder_frontend_config = PretsselEncoderFrontendConfig(
+        prosody_encoder_config=prosody_encoder_config,
+        dropout=0.2,
+        lang_embed_dim=64,
+    )
+
+    fft_layer_config = FFTLayerConfig(
+        attention_heads=2,
+        hidden_dim=1024,
+        kernel_size=9,
+        dropout=0.0,
+        conv1d_dropout=0.2,
+        use_film=True,
+        film_cond_dim=576,
+    )
+
+    variance_predictor_config = VariancePredictorConfig(
+        var_pred_hidden_dim=512,
+        var_pred_kernel_size=5,
+        var_pred_dropout=0.5,
+        use_film=True,
+        film_cond_dim=576,
+    )
+
+    decoder_frontend_config = PretsselDecoderFrontendConfig(
+        upsampling_type="gaussian",
+        variance_predictor_config=variance_predictor_config,
+        add_variance_parallel=True,
+    )
+    return (
+        encoder_frontend_config,
+        fft_layer_config,
+        decoder_frontend_config,
+    )
+
+
+@vocoder_arch("16khz")
+def _16khz_vocoder() -> VocoderConfig:
+    (
+        encoder_frontend_config,
+        fft_layer_config,
+        decoder_frontend_config,
+    ) = pretssel_config()
+
+    return VocoderConfig(
+        encoder_frontend_config=encoder_frontend_config,
+        fft_layer_config=fft_layer_config,
+        decoder_frontend_config=decoder_frontend_config,
+        pn_conv_dim=512,
+        pn_layers=5,
+        pn_conv_kernel_size=5,
+        pn_dropout=0.5,
+        vocab_info=VocabularyInfo(
+            size=10004, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+        ),
+        model_dim=256,
+        max_seq_len=4000,
+        encoder_layers=4,
+        decoder_layers=4,
+        mel_dim=80,
+        langs=[],
+        upsample_rates=[5, 4, 4, 2],
+        upsample_kernel_sizes=[10, 8, 8, 4],
+        upsample_initial_channel=512,
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        channels=1,
+        dimension=128,
+        n_filters=32,
+        ratios=[8, 5, 4, 2],
+        norm="weight_norm",
+        norm_params={},
+        kernel_size=7,
+        last_kernel_size=7,
+        residual_kernel_size=3,
+        causal=False,
+        pad_mode="constant",
+        true_skip=True,
+        compress=2,
+        lstm=2,
+        disable_norm_outer_blocks=0,
+        trim_right_ratio=1.0,
+        gcmvn_stats={},
+    )
+
+
+@vocoder_arch("24khz")
+def _24khz_vocoder() -> VocoderConfig:
+    (
+        encoder_frontend_config,
+        fft_layer_config,
+        decoder_frontend_config,
+    ) = pretssel_config()
+
+    return VocoderConfig(
+        encoder_frontend_config=encoder_frontend_config,
+        fft_layer_config=fft_layer_config,
+        decoder_frontend_config=decoder_frontend_config,
+        pn_conv_dim=512,
+        pn_layers=5,
+        pn_conv_kernel_size=5,
+        pn_dropout=0.5,
+        vocab_info=VocabularyInfo(
+            size=10004, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+        ),
+        model_dim=256,
+        max_seq_len=4000,
+        encoder_layers=4,
+        decoder_layers=4,
+        mel_dim=80,
+        langs=[],
+        upsample_rates=[5, 4, 4, 3],
+        upsample_kernel_sizes=[10, 8, 8, 6],
+        upsample_initial_channel=512,
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        channels=1,
+        dimension=128,
+        n_filters=32,
+        ratios=[8, 5, 4, 2],
+        norm="weight_norm",
+        norm_params={},
+        kernel_size=7,
+        last_kernel_size=7,
+        residual_kernel_size=3,
+        causal=False,
+        pad_mode="constant",
+        true_skip=True,
+        compress=2,
+        lstm=2,
+        disable_norm_outer_blocks=0,
+        trim_right_ratio=1.0,
+        gcmvn_stats={},
+    )
+
+
+class PretsselVocoderBuilder:
+    config: VocoderConfig
+    prosody_encoder_builder: EcapaTDNNBuilder
+    device: Optional[Device] = None
+    dtype: Optional[DataType] = None
+
+    def __init__(
+        self,
+        config: VocoderConfig,
+        prosody_encoder_builder: EcapaTDNNBuilder,
+        *,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ) -> None:
+        """
+        :param config:
+            The configuration to use.
+        :param device:
+            The device on which to initialize modules.
+        :param dtype:
+            The data type of module parameters and buffers.
+        """
+        self.config = config
+        self.prosody_encoder_builder = prosody_encoder_builder
+        self.device, self.dtype = device, dtype
+
+    def build_embed_tokens(self) -> StandardEmbedding:
+        """Build a unit embedding table."""
+
+        return StandardEmbedding(
+            num_embeddings=self.config.vocab_info.size,
+            embedding_dim=self.config.model_dim,
+            init_fn=init_scaled_embedding,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+    def build_fft(self, num_layers: int) -> FeedForwardTransformer:
+        """Build a Transformer encoder."""
+
+        layers = [self.build_fft_layer() for _ in range(num_layers)]
+
+        return FeedForwardTransformer(
+            layers,
+            norm_order=TransformerNormOrder.POST,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+    def build_fft_layer(self) -> FeedForwardTransformerLayer:
+        """Build a Transformer decoder layer."""
+
+        self_attn = self.build_attention(self.config.fft_layer_config.attention_heads)
+
+        conv1d = Conv1dBlock(
+            self.config.model_dim,
+            self.config.fft_layer_config.hidden_dim,
+            self.config.fft_layer_config.kernel_size,
+            bias=True,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        return FeedForwardTransformerLayer(
+            self_attn,
+            conv1d,
+            dropout_p=0.0,  # fairseq1 doesn't have this
+            conv1d_dropout_p=self.config.fft_layer_config.conv1d_dropout,
+            use_film=self.config.fft_layer_config.use_film,
+            film_cond_dim=self.config.fft_layer_config.film_cond_dim,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+    def build_attention(self, num_heads: int) -> MultiheadAttention:
+        """Build a Transformer multi-head attention layer."""
+
+        sdpa = create_default_sdpa(attn_dropout_p=self.config.fft_layer_config.dropout)
+
+        return StandardMultiheadAttention(
+            self.config.model_dim,
+            num_heads,
+            sdpa=sdpa,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+    def build_variance_adaptor(
+        self,
+        decoder_frontend_config: PretsselDecoderFrontendConfig,
+    ) -> VarianceAdaptor:
+        """Build a variance adaptor module."""
+
+        variance_predictor_config = decoder_frontend_config.variance_predictor_config
+
+        pitch_predictor = VariancePredictor(
+            self.config.model_dim,
+            variance_predictor_config.var_pred_hidden_dim,
+            variance_predictor_config.var_pred_kernel_size,
+            variance_predictor_config.var_pred_dropout,
+            use_film=variance_predictor_config.use_film,
+            film_cond_dim=variance_predictor_config.film_cond_dim,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        embed_pitch = Conv1d(1, self.config.model_dim, kernel_size=1)
+
+        vuv_predictor = VariancePredictor(
+            self.config.model_dim,
+            variance_predictor_config.var_pred_hidden_dim,
+            variance_predictor_config.var_pred_kernel_size,
+            variance_predictor_config.var_pred_dropout,
+            use_film=variance_predictor_config.use_film,
+            film_cond_dim=variance_predictor_config.film_cond_dim,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        energy_predictor = VariancePredictor(
+            self.config.model_dim,
+            variance_predictor_config.var_pred_hidden_dim,
+            variance_predictor_config.var_pred_kernel_size,
+            variance_predictor_config.var_pred_dropout,
+            use_film=variance_predictor_config.use_film,
+            film_cond_dim=variance_predictor_config.film_cond_dim,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        embed_energy = Conv1d(1, self.config.model_dim, kernel_size=1)
+
+        variance_adaptor = VarianceAdaptor(
+            duration_predictor=None,
+            pitch_predictor=pitch_predictor,
+            embed_pitch=embed_pitch,
+            vuv_predictor=vuv_predictor,
+            energy_predictor=energy_predictor,
+            embed_energy=embed_energy,
+            add_variance_parallel=decoder_frontend_config.add_variance_parallel,
+            upsampling_type=decoder_frontend_config.upsampling_type,
+        )
+
+        return variance_adaptor
+
+    def build_model(self) -> PretsselVocoder:
+        """build the pretssel vocoder."""
+        prosody_encoder = self.prosody_encoder_builder.build_model()
+        embed_tokens = self.build_embed_tokens()
+
+        embed_positions = SinusoidalPositionEncoder(
+            self.config.model_dim,
+            self.config.max_seq_len,
+            _legacy_pad_idx=self.config.vocab_info.pad_idx,
+            device=self.device,
+        )
+        lang_to_index = {lang: i for i, lang in enumerate(self.config.langs)}
+        encoder_frontend = PretsselEncoderFrontend(
+            prosody_encoder,
+            embed_tokens,
+            embed_positions,
+            lang_to_index,
+            lang_embed_dim=self.config.encoder_frontend_config.lang_embed_dim,
+            dropout_p=self.config.encoder_frontend_config.dropout,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        encoder = self.build_fft(self.config.encoder_layers)
+
+        variance_adaptor = self.build_variance_adaptor(
+            self.config.decoder_frontend_config
+        )
+
+        decoder_frontend = PretsselDecoderFrontend(
+            variance_adaptor,
+            embed_positions,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        decoder = self.build_fft(self.config.decoder_layers)
+
+        final_proj = Linear(
+            self.config.model_dim,
+            self.config.mel_dim,
+            bias=True,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        gcmvn_mean = gcmvn_std = None
+        if self.config.gcmvn_stats is not None:
+            gcmvn_mean = self.config.gcmvn_stats["mean"]
+            gcmvn_std = self.config.gcmvn_stats["std"]
+
+        vocoder = PretsselVocoder(
+            encoder_frontend=encoder_frontend,
+            encoder=encoder,
+            decoder_frontend=decoder_frontend,
+            decoder=decoder,
+            final_proj=final_proj,
+            pn_n_channels=self.config.pn_conv_dim,
+            pn_kernel_size=self.config.pn_conv_kernel_size,
+            pn_layers=self.config.pn_layers,
+            pn_dropout=self.config.pn_dropout,
+            upsample_rates=self.config.upsample_rates,
+            upsample_kernel_sizes=self.config.upsample_kernel_sizes,
+            upsample_initial_channel=self.config.upsample_initial_channel,
+            resblock_kernel_sizes=self.config.resblock_kernel_sizes,
+            resblock_dilation_sizes=self.config.resblock_dilation_sizes,
+            channels=self.config.channels,
+            dimension=self.config.dimension,
+            n_filters=self.config.n_filters,
+            ratios=self.config.ratios,
+            norm=self.config.norm,
+            norm_params=self.config.norm_params,
+            kernel_size=self.config.kernel_size,
+            last_kernel_size=self.config.last_kernel_size,
+            residual_kernel_size=self.config.residual_kernel_size,
+            causal=self.config.causal,
+            pad_mode=self.config.pad_mode,
+            true_skip=self.config.true_skip,
+            compress=self.config.compress,
+            lstm=self.config.lstm,
+            disable_norm_outer_blocks=self.config.disable_norm_outer_blocks,
+            trim_right_ratio=self.config.trim_right_ratio,
+            gcmvn_mean=gcmvn_mean,
+            gcmvn_std=gcmvn_std,
+        )
+        vocoder.to(dtype=self.dtype, device=self.device)
+        return vocoder
+
+
+def create_vocoder_model(
+    config: VocoderConfig,
+    device: Optional[Device] = None,
+    dtype: Optional[DataType] = None,
+) -> PretsselVocoder:
+    prosody_encoder_builder = EcapaTDNNBuilder(
+        config.encoder_frontend_config.prosody_encoder_config,
+        device=device,
+        dtype=dtype,
+    )
+    return PretsselVocoderBuilder(
+        config, prosody_encoder_builder, device=device, dtype=dtype
+    ).build_model()
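
For reference, a minimal sketch (not part of this diff) of building the 24 kHz vocoder straight from its registered config; the gcmvn stats below are dummy values standing in for the mean/std normally read from the model card:

```python
import torch

from seamless_communication.models.generator.builder import (
    _24khz_vocoder,
    create_vocoder_model,
)

config = _24khz_vocoder()
# Dummy stats; the real mean/std come from the checkpoint's model card.
config.gcmvn_stats = {"mean": [0.0] * config.mel_dim, "std": [1.0] * config.mel_dim}

vocoder = create_vocoder_model(config, dtype=torch.float32)
vocoder.eval()
```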

+ 474 - 0
src/seamless_communication/models/generator/ecapa_tdnn.py

@@ -0,0 +1,474 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from fairseq2.nn.padding import PaddingMask, to_padding_mask
+from torch import Tensor
+from torch.nn import Conv1d, LayerNorm, Module, ModuleList, ReLU, Sigmoid, Tanh, init
+
+
+class ECAPA_TDNN(Module):
+    """
+    Represents the ECAPA-TDNN model described in the paper
+    :cite:t:`https://doi.org/10.48550/arxiv.2005.07143`.
+
+    Arguments
+    ---------
+    :param channels:
+        Output channels for TDNN/SERes2Net layer.
+    :param kernel_sizes:
+        List of kernel sizes for each layer.
+    :param dilations:
+        List of dilations for kernels in each layer.
+    :param attention_channels:
+        The number of attention channels in the attentive statistics pooling.
+    :param res2net_scale:
+        The scale of the Res2Net blocks.
+    :param se_channels:
+        The number of squeeze channels in the SE blocks.
+    :param global_context:
+        Whether the attentive statistics pooling uses global context.
+    :param groups:
+        List of groups for kernels in each layer.
+    :param embed_dim:
+        The dimensionality of the output embedding.
+    :param input_dim:
+        The number of input feature channels.
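+
+    Example
+    -------
+    >>> # Illustrative: hyper-parameters of the "base" arch in ecapa_tdnn_builder.py
+    >>> model = ECAPA_TDNN(
+    ...     channels=[512, 512, 512, 512, 1536],
+    ...     kernel_sizes=[5, 3, 3, 3, 1],
+    ...     dilations=[1, 2, 3, 4, 1],
+    ...     attention_channels=128,
+    ...     res2net_scale=8,
+    ...     se_channels=128,
+    ...     global_context=True,
+    ...     groups=[1, 1, 1, 1, 1],
+    ...     embed_dim=512,
+    ...     input_dim=80,
+    ... )
+    >>> x = torch.rand(8, 120, 80)  # (batch, time, channel)
+    >>> model(x).shape
+    torch.Size([8, 512])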
+    """
+
+    def __init__(
+        self,
+        channels: List[int],
+        kernel_sizes: List[int],
+        dilations: List[int],
+        attention_channels: int,
+        res2net_scale: int,
+        se_channels: int,
+        global_context: bool,
+        groups: List[int],
+        embed_dim: int,
+        input_dim: int,
+    ):
+        super().__init__()
+        assert len(channels) == len(kernel_sizes) == len(dilations)
+        self.channels = channels
+        self.embed_dim = embed_dim
+        self.blocks = ModuleList()
+
+        self.blocks.append(
+            TDNNBlock(
+                input_dim,
+                channels[0],
+                kernel_sizes[0],
+                dilations[0],
+                groups[0],
+            )
+        )
+
+        # SE-Res2Net layers
+        for i in range(1, len(channels) - 1):
+            self.blocks.append(
+                SERes2NetBlock(
+                    channels[i - 1],
+                    channels[i],
+                    res2net_scale=res2net_scale,
+                    se_channels=se_channels,
+                    kernel_size=kernel_sizes[i],
+                    dilation=dilations[i],
+                    groups=groups[i],
+                )
+            )
+
+        # Multi-layer feature aggregation
+        self.mfa = TDNNBlock(
+            channels[-1],
+            channels[-1],
+            kernel_sizes[-1],
+            dilations[-1],
+            groups=groups[-1],
+        )
+
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            channels[-1],
+            attention_channels=attention_channels,
+            global_context=global_context,
+        )
+        self.asp_norm = LayerNorm(channels[-1] * 2, eps=1e-12)
+
+        # Final linear transformation
+        self.fc = Conv1d(
+            in_channels=channels[-1] * 2,
+            out_channels=embed_dim,
+            kernel_size=1,
+        )
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """Reset the parameters and buffers of the module."""
+
+        def encoder_init(m: Module) -> None:
+            if isinstance(m, Conv1d):
+                init.xavier_uniform_(m.weight, init.calculate_gain("relu"))
+
+        self.apply(encoder_init)
+
+    def forward(
+        self,
+        x: Tensor,
+        padding_mask: Optional[PaddingMask] = None,
+    ) -> Tensor:
+        """Returns the embedding vector.
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape (batch, time, channel).
+        """
+        # Minimize transpose for efficiency
+        x = x.transpose(1, 2)
+
+        xl = []
+        for layer in self.blocks:
+            x = layer(x, padding_mask=padding_mask)
+            xl.append(x)
+
+        # Multi-layer feature aggregation
+        x = torch.cat(xl[1:], dim=1)
+        x = self.mfa(x)
+
+        # Attentive Statistical Pooling
+        x = self.asp(x, padding_mask=padding_mask)
+        x = self.asp_norm(x.transpose(1, 2)).transpose(1, 2)
+
+        # Final linear transformation
+        x = self.fc(x)
+
+        x = x.transpose(1, 2).squeeze(1)  # B x C
+        return F.normalize(x, dim=-1)
+
+
+class TDNNBlock(Module):
+    """An implementation of TDNN.
+
+    Arguments
+    ----------
+    :param in_channels : int
+        Number of input channels.
+    :param out_channels : int
+        The number of output channels.
+    :param kernel_size : int
+        The kernel size of the TDNN blocks.
+    :param dilation : int
+        The dilation of the TDNN block.
+    :param groups: int
+        The groups size of the TDNN blocks.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        dilation: int,
+        groups: int = 1,
+    ):
+        super().__init__()
+        self.conv = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=dilation * (kernel_size - 1) // 2,
+            groups=groups,
+        )
+        self.activation = ReLU()
+        self.norm = LayerNorm(out_channels, eps=1e-12)
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        x = self.activation(self.conv(x))
+
+        return self.norm(x.transpose(1, 2)).transpose(1, 2)  # type: ignore[no-any-return]
+
+
+class Res2NetBlock(Module):
+    """An implementation of Res2NetBlock w/ dilation.
+
+    Arguments
+    ---------
+    :param in_channels : int
+        The number of channels expected in the input.
+    :param out_channels : int
+        The number of output channels.
+    :param scale : int
+        The scale of the Res2Net block.
+    :param kernel_size: int
+        The kernel size of the Res2Net block.
+    :param dilation : int
+        The dilation of the Res2Net block.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        scale: int = 8,
+        kernel_size: int = 3,
+        dilation: int = 1,
+    ):
+        super().__init__()
+        assert in_channels % scale == 0
+        assert out_channels % scale == 0
+
+        in_channel = in_channels // scale
+        hidden_channel = out_channels // scale
+        self.blocks = ModuleList(
+            [
+                TDNNBlock(
+                    in_channel,
+                    hidden_channel,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                )
+                for i in range(scale - 1)
+            ]
+        )
+        self.scale = scale
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        y = []
+        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
+            if i == 0:
+                y_i = x_i
+            elif i == 1:
+                y_i = self.blocks[i - 1](x_i)
+            else:
+                y_i = self.blocks[i - 1](x_i + y_i)
+            y.append(y_i)
+
+        y_tensor = torch.cat(y, dim=1)
+        return y_tensor
+
+
+class SEBlock(Module):
+    """An implementation of squeeze-and-excitation block.
+
+    Arguments
+    ---------
+    in_channels : int
+        The number of input channels.
+    se_channels : int
+        The number of output channels after squeeze.
+    out_channels : int
+        The number of output channels.
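+
+    Example
+    -------
+    >>> x = torch.rand(8, 64, 120)  # (batch, channel, time)
+    >>> se = SEBlock(in_channels=64, se_channels=16, out_channels=64)
+    >>> se(x).shape
+    torch.Size([8, 64, 120])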
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        se_channels: int,
+        out_channels: int,
+    ):
+        super().__init__()
+
+        self.conv1 = Conv1d(
+            in_channels=in_channels, out_channels=se_channels, kernel_size=1
+        )
+        self.relu = ReLU(inplace=True)
+        self.conv2 = Conv1d(
+            in_channels=se_channels, out_channels=out_channels, kernel_size=1
+        )
+        self.sigmoid = Sigmoid()
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        if padding_mask is not None:
+            mask = padding_mask.materialize().unsqueeze(1)
+            s = (x * mask).sum(dim=2, keepdim=True) / padding_mask.seq_lens[
+                :, None, None
+            ]
+        else:
+            s = x.mean(dim=2, keepdim=True)
+
+        s = self.relu(self.conv1(s))
+        s = self.sigmoid(self.conv2(s))
+
+        return s * x
+
+
+class AttentiveStatisticsPooling(Module):
+    """This class implements an attentive statistic pooling layer for each channel.
+    It returns the concatenated mean and std of the input tensor.
+
+    Arguments
+    ---------
+    channels: int
+        The number of input channels.
+    attention_channels: int
+        The number of attention channels.
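+
+    Example
+    -------
+    >>> x = torch.rand(8, 64, 120)  # (batch, channel, time)
+    >>> asp = AttentiveStatisticsPooling(64, attention_channels=32)
+    >>> asp(x).shape  # concatenated mean and std per channel
+    torch.Size([8, 128, 1])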
+    """
+
+    def __init__(
+        self, channels: int, attention_channels: int = 128, global_context: bool = True
+    ):
+        super().__init__()
+
+        self.eps = 1e-12
+        self.global_context = global_context
+        if global_context:
+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
+        else:
+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
+
+        self.tanh = Tanh()
+        self.conv = Conv1d(
+            in_channels=attention_channels, out_channels=channels, kernel_size=1
+        )
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Calculates mean and std for a batch (input tensor).
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape [N, C, L].
+        """
+        L = x.shape[-1]
+
+        def _compute_statistics(
+            x: Tensor, m: Tensor, dim: int = 2, eps: float = self.eps
+        ) -> Tuple[Tensor, Tensor]:
+            mean = (m * x).sum(dim)
+            std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
+            return mean, std
+
+        # Make binary mask of shape [N, 1, L]
+        if padding_mask is not None:
+            mask = padding_mask.materialize()
+        else:
+            mask = to_padding_mask(torch.IntTensor([L]), L).repeat(x.shape[0], 1).to(x)
+        mask = mask.unsqueeze(1)
+
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
+        if self.global_context:
+            # torch.std is unstable for backward computation
+            # https://github.com/pytorch/pytorch/issues/4320
+            total = mask.sum(dim=2, keepdim=True).to(x)
+            mean, std = _compute_statistics(x, mask / total)
+            mean = mean.unsqueeze(2).repeat(1, 1, L)
+            std = std.unsqueeze(2).repeat(1, 1, L)
+            attn = torch.cat([x, mean, std], dim=1)
+        else:
+            attn = x
+
+        # Apply layers
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+
+        # Filter out zero-paddings
+        attn = attn.masked_fill(mask == 0, float("-inf"))
+
+        attn = F.softmax(attn, dim=2)
+        mean, std = _compute_statistics(x, attn)
+        # Append mean and std of the batch
+        pooled_stats = torch.cat((mean, std), dim=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+
+        return pooled_stats
+
+
+class SERes2NetBlock(Module):
+    """An implementation of building block in ECAPA-TDNN, i.e.,
+    TDNN-Res2Net-TDNN-SEBlock.
+
+    Arguments
+    ----------
+    out_channels: int
+        The number of output channels.
+    res2net_scale: int
+        The scale of the Res2Net block.
+    kernel_size: int
+        The kernel size of the TDNN blocks.
+    dilation: int
+        The dilation of the Res2Net block.
+    groups: int
+    Number of blocked connections from input channels to output channels.
+
+    Example
+    -------
+    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
+    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
+    >>> out = conv(x).transpose(1, 2)
+    >>> out.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        res2net_scale: int = 8,
+        se_channels: int = 128,
+        kernel_size: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TDNNBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            groups=groups,
+        )
+        self.res2net_block = Res2NetBlock(
+            out_channels,
+            out_channels,
+            res2net_scale,
+            kernel_size,
+            dilation,
+        )
+        self.tdnn2 = TDNNBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            groups=groups,
+        )
+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.tdnn1(x)
+        x = self.res2net_block(x)
+        x = self.tdnn2(x)
+        x = self.se_block(x, padding_mask=padding_mask)
+
+        return x + residual

+ 112 - 0
src/seamless_communication/models/generator/ecapa_tdnn_builder.py

@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.typing import DataType, Device
+
+from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN
+
+
+@dataclass
+class EcapaTDNNConfig:
+    channels: List[int]
+    kernel_sizes: List[int]
+    dilations: List[int]
+    attention_channels: int
+    res2net_scale: int
+    se_channels: int
+    global_context: bool
+    groups: List[int]
+    embed_dim: int
+    input_dim: int
+
+
+ecapa_tdnn_archs = ArchitectureRegistry[EcapaTDNNConfig]("ecapa_tdnn")
+
+ecapa_tdnn_arch = ecapa_tdnn_archs.decorator
+
+
+@ecapa_tdnn_arch("base")
+def _base_ecapa_tdnn() -> EcapaTDNNConfig:
+    return EcapaTDNNConfig(
+        channels=[512, 512, 512, 512, 1536],
+        kernel_sizes=[5, 3, 3, 3, 1],
+        dilations=[1, 2, 3, 4, 1],
+        attention_channels=128,
+        res2net_scale=8,
+        se_channels=128,
+        global_context=True,
+        groups=[1, 1, 1, 1, 1],
+        embed_dim=512,
+        input_dim=80,
+    )
+
+
+class EcapaTDNNBuilder:
+    """
+    Builder module for ECAPA_TDNN model
+    """
+
+    config: EcapaTDNNConfig
+    device: Optional[Device]
+    dtype: Optional[DataType]
+
+    def __init__(
+        self,
+        config: EcapaTDNNConfig,
+        *,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ) -> None:
+        """
+        :param config:
+            The configuration to use.
+        :param device:
+            The device on which to initialize modules.
+        :param dtype:
+            The data type of module parameters and buffers.
+        """
+        self.config = config
+
+        self.device, self.dtype = device, dtype
+
+    def build_model(self) -> ECAPA_TDNN:
+        """Build a model."""
+        model = ECAPA_TDNN(
+            self.config.channels,
+            self.config.kernel_sizes,
+            self.config.dilations,
+            self.config.attention_channels,
+            self.config.res2net_scale,
+            self.config.se_channels,
+            self.config.global_context,
+            self.config.groups,
+            self.config.embed_dim,
+            self.config.input_dim,
+        )
+        model.to(device=self.device, dtype=self.dtype)
+        return model
+
+
+def create_ecapa_tdnn_model(
+    config: EcapaTDNNConfig,
+    device: Optional[Device] = None,
+    dtype: Optional[DataType] = None,
+) -> ECAPA_TDNN:
+    """Create a ECAPA_TDNN model.
+
+    :param config:
+        The configuration to use.
+    :param device:
+        The device on which to initialize modules.
+    :param dtype:
+        The data type of module parameters and buffers.
+    """
+
+    return EcapaTDNNBuilder(config, device=device, dtype=dtype).build_model()
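
A minimal sketch (not part of this diff) of how the builder might be used to embed a batch of 80-dimensional mel features with the `base` architecture:

```python
import torch

from seamless_communication.models.generator.ecapa_tdnn_builder import (
    _base_ecapa_tdnn,
    create_ecapa_tdnn_model,
)

config = _base_ecapa_tdnn()
model = create_ecapa_tdnn_model(config, dtype=torch.float32)

feats = torch.rand(2, 100, config.input_dim)  # (batch, time, mel bins)
embeddings = model(feats)  # -> shape (2, config.embed_dim)
```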

+ 29 - 0
src/seamless_communication/models/generator/loader.py

@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq2.assets import asset_store, download_manager
+from fairseq2.models.utils import ConfigLoader, ModelLoader
+
+from seamless_communication.models.generator.builder import (
+    VocoderConfig,
+    create_vocoder_model,
+    vocoder_archs,
+)
+from seamless_communication.models.generator.vocoder import PretsselVocoder
+
+load_pretssel_vocoder_config = ConfigLoader[VocoderConfig](asset_store, vocoder_archs)
+
+
+load_pretssel_vocoder_model = ModelLoader[PretsselVocoder, VocoderConfig](
+    asset_store,
+    download_manager,
+    load_pretssel_vocoder_config,
+    create_vocoder_model,
+    restrict_checkpoints=False,
+)
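
A minimal sketch (not part of this diff) of calling the loader; the card name below is hypothetical and must match a vocoder card registered in the asset store:

```python
import torch

from seamless_communication.models.generator.loader import (
    load_pretssel_vocoder_model,
)

vocoder = load_pretssel_vocoder_model(
    "vocoder_pretssel",  # hypothetical card name
    device=torch.device("cpu"),
    dtype=torch.float32,
)
vocoder.eval()
```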

+ 452 - 0
src/seamless_communication/models/generator/streamable.py

@@ -0,0 +1,452 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import warnings
+from typing import Any, Dict, List, Literal, Optional, Tuple
+
+import torch
+from fairseq2.typing import DataType, Device
+from torch.nn import (
+    ELU,
+    LSTM,
+    Conv1d,
+    ConvTranspose1d,
+    GroupNorm,
+    Identity,
+    Module,
+    Sequential,
+)
+from torch.nn import functional as F
+from torch.nn.utils import spectral_norm, weight_norm  # type: ignore[attr-defined]
+
+CONV_NORMALIZATIONS = frozenset(
+    ["none", "weight_norm", "spectral_norm", "time_group_norm"]
+)
+
+
+def apply_parametrization_norm(
+    module: Module,
+    norm: Literal["none", "weight_norm", "spectral_norm", "time_group_norm"] = "none",
+) -> Module:
+    if norm == "weight_norm":
+        return weight_norm(module)
+    elif norm == "spectral_norm":
+        return spectral_norm(module)
+    else:
+        # `norm` was already checked against CONV_NORMALIZATIONS, so any
+        # other choice doesn't need reparametrization.
+        return module
+
+
+def get_norm_module(  # type: ignore[no-untyped-def]
+    module: Module,
+    causal: bool = False,
+    norm: Literal["none", "weight_norm", "spectral_norm", "time_group_norm"] = "none",
+    **norm_kwargs,
+) -> Module:
+    """Return the proper normalization module. If causal is True, this will ensure the returned
+    module is causal, or raise an error if the normalization doesn't support causal evaluation.
+    """
+    assert norm in CONV_NORMALIZATIONS
+    if norm == "time_group_norm":
+        if causal:
+            raise ValueError("GroupNorm doesn't support causal evaluation.")
+        assert isinstance(module, torch.nn.modules.conv._ConvNd)
+        return GroupNorm(1, module.out_channels, **norm_kwargs)
+    else:
+        return Identity()
+
+
+def get_extra_padding_for_conv1d(
+    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+) -> int:
+    """See `pad_for_conv1d`."""
+    length = x.shape[-1]
+    n_frames = (length - kernel_size + padding_total) / stride + 1
+    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+    return ideal_length - length
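+
+
+# Worked example (illustrative): with length=7, kernel_size=4, stride=2 and
+# padding_total=2, n_frames = (7 - 4 + 2) / 2 + 1 = 3.5, so we need
+# ceil(3.5) = 4 frames; ideal_length = (4 - 1) * 2 + (4 - 2) = 8, hence one
+# extra sample of padding is returned.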
+
+
+def pad_for_conv1d(
+    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+) -> torch.Tensor:
+    """Pad for a convolution to make sure that the last window is full.
+    Extra padding is added at the end. This is required to ensure that we can rebuild
+    an output of the same length, as otherwise, even with padding, some time steps
+    might get removed.
+    For instance, with total padding = 4, kernel size = 4, stride = 2:
+        0 0 1 2 3 4 5 0 0   # (0s are padding)
+        1   2   3           # (output frames of a convolution, last 0 is never used)
+        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+            1 2 3 4         # once you remove the padding, we are missing one time step!
+    """
+    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+    return F.pad(x, (0, extra_padding))  # noqa
+
+
+def pad1d(
+    x: torch.Tensor,
+    paddings: Tuple[int, int],
+    mode: str = "constant",
+    value: float = 0.0,
+) -> torch.Tensor:
+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
+    If this is the case, we insert extra 0 padding to the right before the reflection happen.
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == "reflect":
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            x = F.pad(x, (0, extra_pad))
+        padded = F.pad(x, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+    else:
+        return F.pad(x, paddings, mode, value)
+
+
+def unpad1d(x: torch.Tensor, paddings: Tuple[int, int]) -> torch.Tensor:
+    """Remove padding from x, handling properly zero padding. Only for 1d!"""
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    assert (padding_left + padding_right) <= x.shape[-1]
+    end = x.shape[-1] - padding_right
+    return x[..., padding_left:end]
+
+
+class NormConv1d(Module):
+    """Wrapper around Conv1d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        causal: bool = False,
+        norm: Literal[
+            "none", "weight_norm", "spectral_norm", "time_group_norm"
+        ] = "none",
+        norm_kwargs: Dict[str, Any] = {},
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        self.conv: Module = apply_parametrization_norm(
+            Conv1d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+                device=device,
+                dtype=dtype,
+            ),
+            norm,
+        )
+        self.norm: Module = get_norm_module(self.conv, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+
+class NormConvTranspose1d(Module):
+    """Wrapper around ConvTranspose1d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+
+    def __init__(  # type: ignore[no-untyped-def]
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        causal: bool = False,
+        norm: Literal[
+            "none", "weight_norm", "spectral_norm", "time_group_norm"
+        ] = "none",
+        norm_kwargs: Dict[str, Any] = {},
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(
+            ConvTranspose1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                device=device,
+                dtype=dtype,
+            ),
+            norm,
+        )
+        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.convtr(x)
+        x = self.norm(x)
+        return x
+
+
+class StreamableConv1d(Module):
+    """Conv1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        causal: bool = False,
+        norm: Literal[
+            "none", "weight_norm", "spectral_norm", "time_group_norm"
+        ] = "none",
+        norm_kwargs: Dict[str, Any] = {},
+        pad_mode: str = "reflect",
+        activation: Optional[Module] = None,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        # Warn the user about an unusual combination of stride and dilation
+        if stride > 1 and dilation > 1:
+            warnings.warn(
+                "StreamableConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+        self.activation = activation
+        self.conv = NormConv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            causal=causal,
+            norm=norm,
+            norm_kwargs=norm_kwargs,
+            device=device,
+            dtype=dtype,
+        )
+        self.causal = causal
+        self.pad_mode = pad_mode
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.activation:
+            x = self.activation(x)
+        kernel_size: int = self.conv.conv.kernel_size[0]  # type: ignore[index,assignment]
+        stride: int = self.conv.conv.stride[0]  # type: ignore[index,assignment]
+        dilation = self.conv.conv.dilation[0]  # type: ignore[index]
+        kernel_size = (  # type: ignore[assignment]
+            kernel_size - 1
+        ) * dilation + 1  # effective kernel size with dilations
+        padding_total = kernel_size - stride
+        extra_padding = get_extra_padding_for_conv1d(
+            x, kernel_size, stride, padding_total
+        )
+        if self.causal:
+            # Left padding for causal
+            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            x = pad1d(
+                x, (padding_left, padding_right + extra_padding), mode=self.pad_mode
+            )
+        return self.conv(x)  # type: ignore[no-any-return]
+
+
+class StreamableConvTranspose1d(Module):
+    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        causal: bool = False,
+        norm: Literal[
+            "none", "weight_norm", "spectral_norm", "time_group_norm"
+        ] = "none",
+        trim_right_ratio: float = 1.0,
+        norm_kwargs: Dict[str, Any] = {},
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        self.convtr = NormConvTranspose1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            causal=causal,
+            norm=norm,
+            norm_kwargs=norm_kwargs,
+            device=device,
+            dtype=dtype,
+        )
+        self.causal = causal
+        self.trim_right_ratio = trim_right_ratio
+        assert (
+            self.causal or self.trim_right_ratio == 1.0
+        ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+        assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        kernel_size: int = self.convtr.convtr.kernel_size[0]  # type: ignore[index,assignment]
+        stride: int = self.convtr.convtr.stride[0]  # type: ignore[index,assignment]
+        padding_total = kernel_size - stride
+
+        y: torch.Tensor = self.convtr(x)
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            padding_right = math.ceil(padding_total * self.trim_right_ratio)
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        return y
+
+
+class StreamableLSTM(Module):
+    """LSTM without worrying about the hidden state, nor the layout of the data.
+    Expects input as convolutional layout.
+    """
+
+    def __init__(
+        self,
+        dimension: int,
+        num_layers: int = 2,
+        skip: bool = True,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        self.skip = skip
+        self.lstm = LSTM(dimension, dimension, num_layers, device=device, dtype=dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.permute(2, 0, 1)
+        y, _ = self.lstm(x)
+        if self.skip:
+            y = y + x
+        y = y.permute(1, 2, 0)
+        return y  # type: ignore[no-any-return]
+
+
+class StreamableResnetBlock(Module):
+    """custom Residual block model with streamable convnet.
+
+    Args:
+        dim (int): Dimension of the input/output.
+        kernel_sizes (list): List of kernel sizes for the convolutions.
+        dilations (list): List of dilations for the convolutions.
+        activation_params (dict): Parameters to provide to the (ELU) activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        kernel_sizes: List[int] = [3, 1],
+        dilations: List[int] = [1, 1],
+        activation_params: Dict[str, Any] = {"alpha": 1.0},
+        norm: Literal[
+            "none", "weight_norm", "spectral_norm", "time_group_norm"
+        ] = "none",
+        norm_params: Dict[str, Any] = {},
+        causal: bool = False,
+        pad_mode: str = "reflect",
+        compress: int = 2,
+        true_skip: bool = True,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        assert len(kernel_sizes) == len(
+            dilations
+        ), "Number of kernel sizes should match number of dilations"
+        hidden = dim // compress
+        block = []
+        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = dim if i == 0 else hidden
+            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+            block += [
+                ELU(**activation_params),
+                StreamableConv1d(
+                    in_chs,
+                    out_chs,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                    norm=norm,
+                    norm_kwargs=norm_params,
+                    causal=causal,
+                    pad_mode=pad_mode,
+                    device=device,
+                    dtype=dtype,
+                ),
+            ]
+        self.block = Sequential(*block)
+        self.shortcut: Module
+        if true_skip:
+            self.shortcut = Identity()
+        else:
+            self.shortcut = StreamableConv1d(
+                dim,
+                dim,
+                kernel_size=1,
+                norm=norm,
+                norm_kwargs=norm_params,
+                causal=causal,
+                pad_mode=pad_mode,
+                device=device,
+                dtype=dtype,
+            )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.shortcut(x) + self.block(x)  # type: ignore[no-any-return]
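
A small shape check (not part of this diff) illustrating the padding logic above: with stride 1, a causal `StreamableConv1d` left-pads by `kernel_size - 1`, so the output keeps the input's time resolution:

```python
import torch

from seamless_communication.models.generator.streamable import StreamableConv1d

conv = StreamableConv1d(1, 32, kernel_size=7, causal=True, pad_mode="constant")
y = conv(torch.randn(2, 1, 240))  # left-padded by 6 samples internally
print(y.shape)  # torch.Size([2, 32, 240])
```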

+ 582 - 0
src/seamless_communication/models/generator/vocoder.py

@@ -0,0 +1,582 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, List, Literal, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from fairseq2.nn.embedding import Embedding, StandardEmbedding
+from fairseq2.nn.padding import PaddingMask
+from fairseq2.nn.position_encoder import PositionEncoder
+from fairseq2.nn.projection import Projection
+from fairseq2.typing import DataType, Device
+from torch.nn import (
+    ELU,
+    BatchNorm1d,
+    Conv1d,
+    ConvTranspose1d,
+    Dropout,
+    Module,
+    ModuleList,
+    Parameter,
+    Sequential,
+    Tanh,
+    init,
+)
+from torch.nn.utils.weight_norm import remove_weight_norm, weight_norm
+
+from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN
+from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer
+from seamless_communication.models.unity.length_regulator import VarianceAdaptor
+from seamless_communication.models.vocoder.hifigan import (
+    LRELU_SLOPE,
+    ResBlock,
+    init_weights,
+)
+
+from .streamable import (
+    StreamableConv1d,
+    StreamableConvTranspose1d,
+    StreamableLSTM,
+    StreamableResnetBlock,
+)
+
+ELU_PARAMS: Dict[str, Any] = {"alpha": 1.0}
+
+
+class PretsselEncoderFrontend(Module):
+    """
+    Represents the encoder frontend, including the prosody encoder and language embedding.
+    """
+
+    prosody_encoder: ECAPA_TDNN
+    embed_tokens: Embedding
+    embed_positions: PositionEncoder
+    pos_emb_alpha: Parameter
+    embed_lang: Embedding
+    dropout: Dropout
+
+    def __init__(
+        self,
+        prosody_encoder: ECAPA_TDNN,
+        embed_tokens: Embedding,
+        embed_positions: PositionEncoder,
+        lang_to_index: Dict[str, int],
+        lang_embed_dim: Optional[int],
+        dropout_p: float,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+
+        self.prosody_encoder = prosody_encoder
+
+        self.embed_tokens = embed_tokens
+
+        self.embed_positions = embed_positions
+        self.pos_emb_alpha = Parameter(torch.ones(1, device=device, dtype=dtype))
+
+        self.lang_to_index = lang_to_index
+
+        if lang_embed_dim is not None:
+            self.embed_lang = StandardEmbedding(
+                len(lang_to_index), lang_embed_dim, device=device, dtype=dtype
+            )
+        else:
+            self.register_module("embed_lang", None)
+
+        self.dropout = Dropout(dropout_p)
+
+        self.device = device
+        self.dtype = dtype
+
+    def forward(
+        self,
+        seqs: torch.Tensor,
+        padding_mask: Optional[PaddingMask],
+        prosody_input_seqs: torch.Tensor,
+        prosody_padding_mask: Optional[PaddingMask],
+        tgt_lang: str,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        prosody_embs = self.prosody_encoder(
+            prosody_input_seqs,
+            prosody_padding_mask,
+        ).unsqueeze(1)
+
+        if self.embed_lang is not None:
+            lang_index = self.lang_to_index[tgt_lang]
+            lang_index_tensor = (
+                torch.Tensor([lang_index]).to(seqs).repeat(seqs.size(0), 1)
+            )
+            lang_embeds = self.embed_lang(lang_index_tensor)
+            prosody_embs = torch.cat([prosody_embs, lang_embeds], dim=-1)
+
+        seqs = self.embed_tokens(seqs)
+        seqs += self.pos_emb_alpha * (self.embed_positions(seqs, padding_mask) - seqs)
+        seqs = self.dropout(seqs)
+
+        return seqs, prosody_embs
+
+
+class PretsselDecoderFrontend(Module):
+    """Represent Decoder frontend, including VarianceAdaptor & Positional embedding"""
+
+    variance_adaptor: VarianceAdaptor
+    embed_positions: PositionEncoder
+    pos_emb_alpha: Parameter
+
+    def __init__(
+        self,
+        variance_adaptor: VarianceAdaptor,
+        embed_positions: PositionEncoder,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+
+        self.variance_adaptor = variance_adaptor
+        self.embed_positions = embed_positions
+        self.pos_emb_alpha = Parameter(torch.ones(1, device=device, dtype=dtype))
+
+        self.device = device
+        self.dtype = dtype
+
+    def forward(
+        self,
+        seqs: torch.Tensor,
+        padding_mask: PaddingMask,
+        durations: Optional[torch.Tensor] = None,
+        duration_factor: float = 1.0,
+        min_duration: int = 0,
+        film_cond_emb: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, PaddingMask]:
+        seqs, padding_mask, _ = self.variance_adaptor(
+            seqs, padding_mask, durations, duration_factor, min_duration, film_cond_emb
+        )
+
+        seqs += self.pos_emb_alpha * (self.embed_positions(seqs, padding_mask) - seqs)
+
+        return seqs, padding_mask
+
+
+class PretsselVocoder(Module):
+    """The expressivity-preserving vocoder"""
+
+    encoder_frontend: PretsselEncoderFrontend
+    encoder: FeedForwardTransformer
+    decoder_frontend: PretsselDecoderFrontend
+    decoder: FeedForwardTransformer
+    final_proj: Projection
+
+    def __init__(  # type: ignore[no-untyped-def]
+        self,
+        encoder_frontend: PretsselEncoderFrontend,
+        encoder: FeedForwardTransformer,
+        decoder_frontend: PretsselDecoderFrontend,
+        decoder: FeedForwardTransformer,
+        final_proj: Projection,
+        pn_n_channels: int,
+        pn_kernel_size: int,
+        pn_layers: int,
+        pn_dropout: float,
+        upsample_rates: List[int],
+        upsample_kernel_sizes: List[int],
+        upsample_initial_channel: int,
+        resblock_kernel_sizes: List[int],
+        resblock_dilation_sizes: List[List[int]],
+        mel_dim: int = 80,
+        add_ups_out_pad: bool = True,
+        channels: int = 1,
+        dimension: int = 128,
+        n_filters: int = 32,
+        ratios: List[int] = [8, 5, 4, 2],
+        norm: Literal[
+            "none", "weight_norm", "spectral_norm", "time_group_norm"
+        ] = "none",
+        norm_params: Dict[str, Any] = {},
+        kernel_size: int = 7,
+        last_kernel_size: int = 7,
+        residual_kernel_size: int = 3,
+        causal: bool = False,
+        pad_mode: str = "constant",
+        true_skip: bool = True,
+        compress: int = 2,
+        lstm: int = 0,
+        disable_norm_outer_blocks: int = 0,
+        trim_right_ratio: float = 1.0,
+        gcmvn_mean: Optional[List[float]] = None,
+        gcmvn_std: Optional[List[float]] = None,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+        self.encoder_frontend = encoder_frontend
+        self.encoder = encoder
+        self.decoder_frontend = decoder_frontend
+        self.decoder = decoder
+        self.final_proj = final_proj
+        mult = 1
+        stream_layers: List[Module] = [
+            StreamableConv1d(
+                channels,
+                mult * n_filters,
+                kernel_size,
+                norm="none" if disable_norm_outer_blocks >= 1 else norm,
+                norm_kwargs=norm_params,
+                causal=causal,
+                pad_mode=pad_mode,
+                activation=Tanh(),
+                device=device,
+                dtype=dtype,
+            )
+        ]
+        # Downsample from the raw audio scale
+        for i, ratio in enumerate(list(reversed(ratios))):
+            block_norm = "none" if disable_norm_outer_blocks >= i + 2 else norm
+            stream_layers.append(
+                StreamableResnetBlock(
+                    mult * n_filters,
+                    kernel_sizes=[residual_kernel_size, 1],
+                    dilations=[1, 1],
+                    norm=block_norm,
+                    norm_params=norm_params,
+                    causal=causal,
+                    pad_mode=pad_mode,
+                    compress=compress,
+                    true_skip=true_skip,
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+            stream_layers.append(ELU(**ELU_PARAMS))
+            stream_layers.append(
+                StreamableConv1d(
+                    mult * n_filters,
+                    mult * n_filters * 2,
+                    kernel_size=ratio * 2,
+                    stride=ratio,
+                    norm=block_norm,
+                    norm_kwargs=norm_params,
+                    causal=causal,
+                    pad_mode=pad_mode,
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+            mult *= 2
+
+        stream_layers.append(StreamableLSTM(mult * n_filters, num_layers=lstm))
+        stream_layers.append(ELU(**ELU_PARAMS))
+        n_blocks = len(ratios) + 2
+        stream_layers.append(
+            StreamableConv1d(
+                mult * n_filters,
+                dimension,
+                last_kernel_size,
+                norm="none" if disable_norm_outer_blocks == n_blocks else norm,
+                norm_kwargs=norm_params,
+                causal=causal,
+                pad_mode=pad_mode,
+                device=device,
+                dtype=dtype,
+            )
+        )
+        stream_layers.append(
+            StreamableConv1d(
+                dimension,
+                mult * n_filters,
+                kernel_size,
+                norm="none" if disable_norm_outer_blocks == n_blocks else norm,
+                norm_kwargs=norm_params,
+                causal=causal,
+                pad_mode=pad_mode,
+                device=device,
+                dtype=dtype,
+            )
+        )
+        stream_layers.append(
+            StreamableLSTM(
+                mult * n_filters, num_layers=lstm, device=device, dtype=dtype
+            )
+        )
+
+        # Upsample back to the raw audio scale
+        for i, ratio in enumerate(ratios):
+            block_norm = (
+                "none" if disable_norm_outer_blocks >= n_blocks - (i + 1) else norm
+            )
+            stream_layers.append(ELU(**ELU_PARAMS))
+            stream_layers.append(
+                StreamableConvTranspose1d(
+                    mult * n_filters,
+                    mult * n_filters // 2,
+                    kernel_size=ratio * 2,
+                    stride=ratio,
+                    norm=block_norm,
+                    norm_kwargs=norm_params,
+                    causal=causal,
+                    trim_right_ratio=trim_right_ratio,
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+            stream_layers.append(
+                StreamableResnetBlock(
+                    mult * n_filters // 2,
+                    kernel_sizes=[residual_kernel_size, 1],
+                    dilations=[1, 1],
+                    norm=block_norm,
+                    norm_params=norm_params,
+                    activation_params=ELU_PARAMS,
+                    causal=causal,
+                    pad_mode=pad_mode,
+                    compress=compress,
+                    true_skip=true_skip,
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+            mult //= 2
+
+        stream_layers.append(ELU(**ELU_PARAMS))
+        stream_layers.append(
+            StreamableConv1d(
+                n_filters,
+                channels,
+                last_kernel_size,
+                norm="none" if disable_norm_outer_blocks >= 1 else norm,
+                norm_kwargs=norm_params,
+                causal=causal,
+                pad_mode=pad_mode,
+                device=device,
+                dtype=dtype,
+            )
+        )
+        self.n_streams = len(stream_layers)
+        chunk_size = self.n_streams // 4
+        stream_idx = 0
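+        # The streamable layers are deliberately split into four chunks and
+        # interleaved with the PostNet and HiFi-GAN layers registered below,
+        # so the watermarking path cannot be removed by deleting one
+        # contiguous block of `self.layers`.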
+
+        self.pn_layers = pn_layers
+        self.layers = ModuleList()
+        assert pn_kernel_size % 2 == 1
+        for i in range(pn_layers):
+            cur_layers = (
+                [
+                    Conv1d(
+                        mel_dim if i == 0 else pn_n_channels,
+                        pn_n_channels if i < pn_layers - 1 else mel_dim,
+                        kernel_size=pn_kernel_size,
+                        padding="same",
+                        device=device,
+                        dtype=dtype,
+                    ),
+                    BatchNorm1d(
+                        pn_n_channels if i < pn_layers - 1 else mel_dim,
+                        device=device,
+                        dtype=dtype,
+                    ),
+                ]
+                + ([Tanh()] if i < pn_layers - 1 else [])
+                + [Dropout(pn_dropout)]
+            )
+            self.layers.append(Sequential(*cur_layers))
+        self.reset_parameters()
+        self.layers.extend(stream_layers[:chunk_size])
+        stream_idx += chunk_size
+        self.layers.append(
+            weight_norm(
+                Conv1d(
+                    mel_dim if mel_dim is not None else 80,
+                    upsample_initial_channel,
+                    7,
+                    1,
+                    padding="same",
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+        )
+        self.layers.extend(stream_layers[stream_idx : stream_idx + chunk_size])  # noqa
+        stream_idx += chunk_size
+
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        ups = ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            out_pad = u % 2 if add_ups_out_pad else 0
+            ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2 + out_pad,
+                        output_padding=out_pad,
+                        device=device,
+                        dtype=dtype,
+                    )
+                )
+            )
+        ups.apply(init_weights)
+        self.layers.extend(ups)
+        self.layers.extend(stream_layers[stream_idx : stream_idx + chunk_size])  # noqa
+        stream_idx += chunk_size
+
+        for i in range(self.num_upsamples):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
+                self.layers.append(
+                    ResBlock(
+                        ch,
+                        k,
+                        d,
+                    ).to(device, dtype=dtype)
+                )
+        self.layers.extend(stream_layers[stream_idx:])
+
+        conv_post = weight_norm(
+            Conv1d(ch, 1, 7, 1, padding=3, device=device, dtype=dtype)
+        )
+        conv_post.apply(init_weights)
+        self.layers.append(conv_post)
+        for u, k in zip(upsample_rates, upsample_kernel_sizes):
+            assert k == 2 * u, (k, u)
+
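+        # Per-mel-bin normalization statistics; zero-initialized here and
+        # assumed to be overwritten when the checkpoint is loaded.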
+        mean = torch.zeros((mel_dim,), dtype=torch.float)
+        scale = torch.zeros((mel_dim,), dtype=torch.float)
+        self.register_buffer("mean", mean)
+        self.register_buffer("scale", scale)
+
+        self.gcmvn_mean = torch.tensor(gcmvn_mean, device=device, dtype=dtype)
+        self.gcmvn_std = torch.tensor(gcmvn_std, device=device, dtype=dtype)
+
+    def reset_parameters(self) -> None:
+        for i in range(self.pn_layers):
+            init.xavier_uniform_(
+                self.layers[i][0].weight,
+                init.calculate_gain("tanh" if i < self.pn_layers - 1 else "linear"),
+            )
+
+    def gcmvn_denormalize(self, x: torch.Tensor) -> torch.Tensor:
+        if self.gcmvn_mean is None or self.gcmvn_std is None:
+            raise ValueError("gcmvn_mean or gcmvn_std is not set")
+
+        assert (
+            x.ndim == 3
+            and x.shape[2] == self.gcmvn_mean.shape[0]
+            and x.shape[2] == self.gcmvn_std.shape[0]
+        )
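+        # Denormalize: x * std + mean, broadcast over batch and time.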
+        gcmvn_mean = self.gcmvn_mean.to(x)
+        gcmvn_std = self.gcmvn_std.to(x)
+        x = x * gcmvn_std.view(1, 1, -1).expand_as(x)  # type: ignore[attr-defined]
+        return x + gcmvn_mean.view(1, 1, -1).expand_as(x)  # type: ignore[attr-defined,no-any-return]
+
+    def forward(
+        self,
+        seqs: torch.Tensor,
+        tgt_lang: str,
+        prosody_input_seqs: torch.Tensor,
+        padding_mask: Optional[PaddingMask] = None,
+        prosody_padding_mask: Optional[PaddingMask] = None,
+        durations: Optional[torch.Tensor] = None,
+        duration_factor: float = 1.0,
+        min_duration: int = 0,
+        normalize_before: bool = True,
+    ) -> torch.Tensor:
+        # Add a batch dimension for pretssel if the inputs are unbatched
+        if seqs.ndim < 3:
+            seqs = seqs.unsqueeze(0)
+        if prosody_input_seqs.ndim < 3:
+            prosody_input_seqs = prosody_input_seqs.unsqueeze(0)
+        seqs, cond_embs = self.encoder_frontend(
+            seqs,
+            padding_mask,
+            prosody_input_seqs,
+            prosody_padding_mask,
+            tgt_lang,
+        )
+        seqs, padding_mask = self.encoder(seqs, padding_mask, cond_embs)
+        seqs, padding_mask = self.decoder_frontend(
+            seqs, padding_mask, durations, duration_factor, min_duration, cond_embs
+        )
+        seqs, padding_mask = self.decoder(seqs, padding_mask, cond_embs)
+        seqs = self.final_proj(seqs)
+
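+        # PostNet: compute a residual over the projected features (added back below).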
+        pn = seqs.transpose(1, 2)  # B x T x C -> B x C x T
+        for i in range(self.pn_layers):
+            pn = self.layers[i](pn)
+        pn = pn.transpose(1, 2)
+
+        x = seqs + pn
+        x = self.gcmvn_denormalize(x).squeeze(0)
+        if normalize_before:
+            x = (x - self.mean) / self.scale
+
+        x = x.transpose(1, 0).unsqueeze(0)
+        chunk_size = self.n_streams // 4
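+        # Index bookkeeping: self.layers holds, in order, the PostNet convs,
+        # stream chunk 1, the input conv, stream chunk 2, the upsamplers,
+        # stream chunk 3, the ResBlocks, stream chunk 4, and conv_post.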
+        x = self.layers[self.pn_layers + chunk_size](x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x = self.layers[i + self.pn_layers + 1 + 2 * chunk_size](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.layers[
+                        i * self.num_kernels
+                        + j
+                        + self.pn_layers
+                        + 3 * chunk_size
+                        + self.num_upsamples
+                        + 1
+                    ](x)
+                else:
+                    xs += self.layers[
+                        i * self.num_kernels
+                        + j
+                        + self.pn_layers
+                        + 3 * chunk_size
+                        + self.num_upsamples
+                        + 1
+                    ](x)
+            x = xs / self.num_kernels  # type: ignore
+        x = F.leaky_relu(x)
+        x = self.layers[
+            self.pn_layers
+            + self.n_streams
+            + self.num_upsamples * (1 + self.num_kernels)
+            + 1
+        ](x)
+        skip_output = x
+        h = skip_output
+
+        for i1 in range(self.pn_layers, self.pn_layers + chunk_size):
+            h = self.layers[i1](h)
+        i1 += 2
+        for i2 in range(i1, i1 + chunk_size):
+            h = self.layers[i2](h)
+        i2 = i2 + self.num_upsamples + 1
+
+        for i3 in range(i2, i2 + chunk_size):
+            h = self.layers[i3](h)
+        i3 = i3 + (self.num_upsamples * self.num_kernels) + 1
+        for i4 in range(i3, i3 + chunk_size):
+            h = self.layers[i4](h)
+        h = h[:, :, : x.size(-1)]
+
+        h += torch.tanh(skip_output).squeeze(0)
+        return h
+
+    def remove_weight_norm(self) -> None:
+        # Mirror the interleaved layer layout used in forward()
+        chunk_size = self.n_streams // 4
+        remove_weight_norm(self.layers[self.pn_layers + chunk_size])
+        for j in range(self.num_upsamples):
+            remove_weight_norm(self.layers[self.pn_layers + 2 * chunk_size + 1 + j])
+        for k in range(self.num_upsamples * self.num_kernels):
+            self.layers[
+                self.pn_layers + 3 * chunk_size + self.num_upsamples + 1 + k
+            ].remove_weight_norm()
+        remove_weight_norm(
+            self.layers[
+                self.pn_layers
+                + self.n_streams
+                + self.num_upsamples * (1 + self.num_kernels)
+                + 1
+            ]
+        )

+ 1 - 1
src/seamless_communication/models/unity/builder.py

@@ -26,7 +26,7 @@ from fairseq2.nn.transformer import (
 from fairseq2.typing import DataType, Device, override
 from torch.nn import GELU, ReLU
 
-from seamless_communication.models.pretssel import (
+from seamless_communication.models.generator.ecapa_tdnn_builder import (
     EcapaTDNNBuilder,
     EcapaTDNNConfig,
     ecapa_tdnn_archs,

+ 8 - 2
src/seamless_communication/models/unity/loader.py

@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Mapping, Tuple, Union
 
 import torch
 from fairseq2.assets import AssetStore, asset_store, download_manager
-from fairseq2.assets.card import AssetCard
+from fairseq2.assets.card import AssetCard, AssetCardFieldNotFoundError
 from fairseq2.models.nllb import NllbConfig
 from fairseq2.models.nllb.loader import NllbTokenizerLoader
 from fairseq2.models.utils import ConfigLoader, ModelLoader
@@ -459,7 +459,13 @@ class GcmvnStatsLoader:
         else:
             card = self.asset_store.retrieve_card(model_name_or_card)
 
-        gcmvn_stats: Dict[str, List[float]] = card.field("gcmvn_stats").as_(dict)
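+        # Some cards keep gcmvn_stats inside the model_config override rather
+        # than as a top-level field.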
+        try:
+            gcmvn_stats: Dict[str, List[float]] = card.field("gcmvn_stats").as_(dict)
+        except AssetCardFieldNotFoundError:
+            model_override = card.field("model_config").as_(dict)
+            gcmvn_stats = model_override["gcmvn_stats"]
 
         return gcmvn_stats["mean"], gcmvn_stats["std"]
 

+ 1 - 1
src/seamless_communication/models/unity/model.py

@@ -19,7 +19,7 @@ from overrides import final as finaloverride
 from torch import Tensor
 from torch.nn import Module
 
-from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN
+from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN
 from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer
 from seamless_communication.models.unity.nar_decoder_frontend import NARDecoderFrontend
 

+ 154 - 0
tests/integration/models/test_watermarked_vocoder.py

@@ -0,0 +1,154 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+from pathlib import Path
+from typing import Final, List, Optional, cast
+import torch
+from fairseq2.typing import Device
+from fairseq2.data import Collater, SequenceData
+from fairseq2.data.audio import AudioDecoderOutput
+from torch.nn import Module
+
+from seamless_communication.inference.pretssel_generator import PretsselGenerator
+from seamless_communication.models.unity.loader import load_gcmvn_stats
+from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
+from tests.common import (
+    assert_close,
+    convert_to_collated_fbank,
+)
+
+
+N_MEL_BINS = 80
+
+# fmt: off
+REF_FRA_UNITS: Final = [8976, 6589, 6589, 5736, 7542, 6515, 1240, 8335, 2381, 1076, 1076, 3380, 4085, 8207, 7957, 4446, 2641, 2544, 5552, 5529, 6319, 2779, 2890, 2890, 3229, 3303, 9751, 1979, 664, 1859, 1302, 528, 1303, 9543, 5770, 3532, 1286, 1286, 1727, 9287, 5248, 5586, 594, 3385, 2613, 1717, 7529, 7634, 931, 1602, 4512, 850, 2748, 5056, 1086, 2320, 2320, 9320, 3223, 5592, 1122, 419, 24, 4126, 5200, 2712, 9549, 8676, 8676, 3443, 7598, 7598, 2200, 2745, 1215, 118, 3840, 2703, 1616, 8788, 1240, 3349, 4890, 2756, 166, 9574, 9773, 5887, 2516, 9332, 6092, 3377, 4334, 3127, 3127, 3127, 944, 3089, 5947, 6572, 6572, 7561, 4358, 4358, 4358, 8124, 5549, 9275, 82, 8830, 8830, 5949, 22, 6729, 6878, 3817, 1871, 6092, 1441, 3127, 3928, 8254, 7984, 1116, 2796, 1806, 3710, 797, 9269, 576, 576, 2020, 137, 6624, 3815, 8690, 3634, 6036, 3530, 8719, 3458, 138, 8745, 5233, 2235, 8580, 8580, 6831, 2709, 7136, 9693, 3437, 3437, 3238, 4368, 2321, 2321, 391, 391, 4976, 8622, 6722, 3864, 9113, 9113, 7222, 7222, 7937, 999, 1286, 1286, 7789, 9396, 9603, 6690, 5233, 2235, 618, 8830, 6954, 3668, 4302, 596, 1934, 2886, 2704, 9097, 4161, 458, 4147, 9245, 9245, 3127, 3127, 944, 9676, 9676, 3468, 270, 270, 4608, 5549, 4182, 102, 8568, 1286, 1286, 5087, 817, 4153, 207, 207, 3763, 6415, 5188, 6010, 554, 753, 9953, 5104, 3828, 1879, 995, 9683, 6932, 3644, 2683, 9335, 183, 5525, 7023, 9568, 6222, 6315, 676, 3443, 6971, 2084, 999, 1286, 1286, 9620, 9620, 1048, 5577, 9328, 4963, 1364, 8328, 4573, 4573, 7917, 7917, 560, 2020, 4923, 137, 9542, 5832, 9775, 4780, 9400, 2745, 2745, 8984, 628, 8834, 6932, 3817, 8312, 5393, 458, 4147, 9191, 2225, 2759, 8980, 2351, 193, 1476, 9347, 3063, 2076, 3641, 1614, 9832, 3554, 8197, 5589, 5589, 7306, 184, 1708, 2954, 2954, 3485, 3485, 7665, 8909, 5405, 3590, 3590, 3446, 6442, 6442, 2802, 5549, 3791]
+# fmt: on
+
+
+def load_watermarking_model() -> Optional[Module]:
+    import importlib.util
+
+    # Run in CPU mode until pretssel's inconsistent behaviour is fixed
+    device = Device("cpu")
+    dtype = torch.float32
+    wm_py_file = Path(__file__).parents[3] / "scripts/watermarking/watermarking.py"
+    assert wm_py_file.is_file()
+    wm_spec = importlib.util.spec_from_file_location("watermark.f1", wm_py_file)
+    assert wm_spec, f"Module not found: {wm_py_file}"
+    wm_py_module = importlib.util.module_from_spec(wm_spec)
+    assert wm_py_module, f"Invalid Python module file: {wm_py_file}"
+    sys.modules["watermark.f1"] = wm_py_module
+    assert wm_spec.loader, f"Module cannot be loaded from {wm_py_file}"
+    wm_spec.loader.exec_module(wm_py_module)
+
+    return cast(Module, wm_py_module.model_from_checkpoint(device=device, dtype=dtype))
+
+
+def test_pretssel_vocoder_watermarking(
+    example_rate16k_audio: AudioDecoderOutput,
+) -> None:
+    """
+    Test that the watermarked pretssel vocoder generates the same output
+    as the non-watermarked (pretssel_generator)
+    """
+    audio = example_rate16k_audio
+
+    # Run in CPU mode until pretssel's inconsistent behaviour is fixed
+    device = Device("cpu")
+    dtype = torch.float32
+    audio["waveform"] = audio["waveform"].to(device, dtype=dtype)
+    feat = convert_to_collated_fbank(audio, dtype=dtype)["seqs"][0]
+    feat = feat.to(device, dtype=dtype)
+    # Run the watermarked vocoding
+    # TODO: Build a generator API for the watermarked vocoder
+    vocoder = load_pretssel_vocoder_model(
+        "vocoder_pretssel", device=device, dtype=dtype
+    )
+
+    units = torch.tensor(REF_FRA_UNITS, device=device, dtype=torch.int64)
+
+    # Offset the units by the 4 control symbols preceding them in the embedding table
+    units += 4
+
+    # eos_idx = 2 in the VocabularyInfo setting for base pretssel_vocoder
+    unit_eos_token = torch.tensor([2], device=device)
+    units = torch.cat([units, unit_eos_token], dim=0)
+    units, duration = torch.unique_consecutive(units, return_counts=True)
+
+    # adjust for the last eos token
+    duration[-1] = 0
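+    # Each unit is assumed to cover two feature frames, hence the doubling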
+    duration *= 2
+
+    # bos_idx=0 in base VocabularyInfo
+    duration_collate = Collater(pad_value=0)
+    duration_seqs = duration_collate(duration)
+
+    with torch.no_grad():
+        vocoder.eval()
+        wav_wm = vocoder(
+            seqs=units,
+            tgt_lang="fra",
+            prosody_input_seqs=feat,
+            durations=duration_seqs["seqs"],
+            normalize_before=True,
+        )
+
+    # torchaudio.save("wm.wav", wav_wm.squeeze(0).float().cpu(), sample_rate=16000)
+
+    # Run the non-watermarked vocoder using pretssel generator
+    gcmvn_mean, gcmvn_std = load_gcmvn_stats("pretssel_v1")
+    gcmvn_mean = torch.tensor(gcmvn_mean, device=device, dtype=dtype)  # type: ignore[assignment]
+    gcmvn_std = torch.tensor(gcmvn_std, device=device, dtype=dtype)  # type: ignore[assignment]
+
+    generator = PretsselGenerator(
+        "seamless_expressivity",
+        "vocoder_mel_24khz",
+        "pretssel_v1",
+        gcmvn_mean=gcmvn_mean,  # type: ignore[arg-type]
+        gcmvn_std=gcmvn_std,  # type: ignore[arg-type]
+        device=device,
+        dtype=dtype,
+    )
+
+    # PretsselGenerator expects a batch of units
+    unit_list: List[List[int]] = [REF_FRA_UNITS]
+    prosody_input_seqs = SequenceData(
+        is_ragged=False,
+        seqs=feat.unsqueeze(0),  # add batch dim
+        seq_lens=torch.tensor([feat.size(0)]),
+    )
+    speech_output = generator.predict(
+        unit_list,
+        tgt_lang="fra",
+        prosody_encoder_input=prosody_input_seqs,
+    )
+    wav = speech_output.audio_wavs[0].unsqueeze(0)
+
+    # torchaudio.save("mel.wav", wav.float().cpu(), sample_rate=16000)
+
+    # Run the watermark model separately after the PretsselGenerator
+    watermarker = load_watermarking_model()
+    wm = watermarker.get_watermark(wav)  # type: ignore
+    wav_wm_hat = wav + wm
+
+    # Test that the watermark is detectable
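+    # detection[:, 1, :] is assumed to be the per-frame watermark probability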
+    detection = watermarker.detect_watermark(wav_wm)  # type: ignore
+    assert torch.all(detection[:, 1, :] > 0.5)
+
+    # Remove the batch dimension and compare the overlapping frames for parity
+    wav_wm = wav_wm.squeeze(0)
+    wav_wm_hat = wav_wm_hat.squeeze(0)
+
+    nframes = min(wav_wm_hat.size(1), wav_wm.size(1))
+    assert_close(
+        wav_wm[:, :nframes],
+        wav_wm_hat[:, :nframes],
+        atol=0.0,
+        rtol=5.0,
+    )