
Introduce Prosody encoder (#87)

Can Balioglu 1 year ago
parent
commit
05419775be

+ 1 - 1
pyproject.toml

@@ -12,7 +12,7 @@ per-file-ignores = [
 profile = "black"
 
 [tool.mypy]
-disable_error_code = "type-abstract"
+disable_error_code = "type-abstract,typeddict-unknown-key"
 disallow_untyped_calls = false
 disallow_untyped_decorators = false
 ignore_missing_imports = true

+ 51 - 0
src/seamless_communication/cards/seamless_expressivity.yaml

@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: seamless_expressivity
+base: unity_nllb-100
+model_arch: expressivity_v2
+char_tokenizer: "file://checkpoint/krs/unity2/spm_char_lang38_tc.model"
+checkpoint: "file://checkpoint/hygong/Expressivity/multilingual_models/m2m.clean.ecapa_tdnn2.dim512.all.all.lr5e-05.mk4k.config_t2_fbank_nosa_gcmvn_10k.rdrop0.ls0.2.uf3.wu5k.fp16.mem_fp16.seed1.dr0.1.ld0.2.mp0.3.cmp0.25.ma.ak8.as8.al1.ald0.0.dld0.0.ca.D24L.t2uE4L.t2uD4L.usesfilm.inj_dec.ngpu64/checkpoint_best_export.pt"
+num_units: 10000
+unit_langs:
+  - arb
+  - ben
+  - cat
+  - ces
+  - cmn
+  - cym
+  - dan
+  - deu
+  - eng
+  - est
+  - fin
+  - fra
+  - hin
+  - ind
+  - ita
+  - jpn
+  - kan
+  - kor
+  - mlt
+  - nld
+  - pes
+  - pol
+  - por
+  - ron
+  - rus
+  - slk
+  - spa
+  - swe
+  - swh
+  - tam
+  - tel
+  - tgl
+  - tha
+  - tur
+  - ukr
+  - urd
+  - uzn
+  - vie

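Note: the card above is a fairseq2 asset card that makes the model loadable by name; the checkpoint and tokenizer URIs point at internal storage. Below is a minimal sketch (not part of this PR) that sanity-checks the card fields with plain PyYAML; the relative path and the availability of PyYAML are assumptions.

# Illustrative only: parse the new asset card and check the fields added above.
from pathlib import Path

import yaml  # assumption: PyYAML is installed

card_path = Path("src/seamless_communication/cards/seamless_expressivity.yaml")
card = yaml.safe_load(card_path.read_text())

assert card["model_arch"] == "expressivity_v2"
assert card["num_units"] == 10000
print(f"{card['name']}: base={card['base']}, {len(card['unit_langs'])} unit languages")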
+ 1 - 2
src/seamless_communication/cli/eval_utils/compute_metrics.py

@@ -10,17 +10,16 @@ from typing import Tuple, Union
 
 import pandas as pd
 import whisper
-
 from fairseq2.typing import Device
 from jiwer import cer, wer
 from sacrebleu.metrics.base import Score, Signature
 from sacrebleu.metrics.bleu import BLEU
 from sacrebleu.metrics.chrf import CHRF
-from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2
 from tqdm import tqdm
 from whisper import Whisper
 from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer
 
+from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2
 
 logging.basicConfig(
     level=logging.INFO,

+ 0 - 0
src/seamless_communication/cli/expressivity/__init__.py


+ 0 - 0
src/seamless_communication/cli/expressivity/evaluate/__init__.py


+ 423 - 0
src/seamless_communication/cli/expressivity/evaluate/evaluate.py

@@ -0,0 +1,423 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import contextlib
+import logging
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torchaudio
+from fairseq2.data import Collater, CString, DataPipeline, FileMapper
+from fairseq2.data.audio import (
+    AudioDecoder,
+    WaveformToFbankConverter,
+    WaveformToFbankOutput,
+)
+from fairseq2.data.text import StrSplitter, TextTokenizer, read_text
+from fairseq2.data.typing import PathLike, StringLike
+from fairseq2.generation import SequenceGeneratorOptions
+from fairseq2.typing import DataType, Device
+from sacrebleu.metrics import BLEU  # type: ignore[attr-defined]
+from torch import Tensor
+from tqdm import tqdm
+
+from seamless_communication.cli.m4t.predict import (
+    add_inference_arguments,
+    set_generation_opts,
+)
+from seamless_communication.inference import BatchedSpeechOutput, Modality, Translator
+from seamless_communication.models.unity import load_unity_text_tokenizer
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class EvalContext:
+    task: str
+    """String representing the task. Valid choices are
+    "S2ST", "S2TT", "T2ST", "T2TT", "ASR"."""
+
+    output_modality: Modality
+    """The output modality of the task."""
+
+    model_name: str
+    """The name of the S2T UnitY model."""
+
+    data_file: Path
+    """The pathname of the test TSV data file."""
+
+    audio_root_dir: Optional[Path]
+    """The pathname of the directory under which
+    audio files are stored."""
+
+    target_lang: str
+    """The target translation language."""
+
+    source_lang: Optional[str]
+    """The source language."""
+
+    batch_size: int
+    """The batch size for model input."""
+
+    device: Device
+    """The device on which to run inference."""
+
+    dtype: DataType
+    """The data type with which to run inference."""
+
+    output_path: Path
+    """The pathname of the output directory to save
+    the evaluation results."""
+
+    ref_field: str
+    """The reference target text field to compute
+    the BLEU score against."""
+
+    text_generation_opts: SequenceGeneratorOptions
+    """Text generation hyperparameters."""
+
+    unit_generation_opts: Optional[SequenceGeneratorOptions]
+    """Unit generation hyperparameters, not applicable
+    for the NAR T2U decoder."""
+
+    unit_generation_ngram_filtering: bool
+    """If True, removes consecutive repeating ngrams
+    from the decoded unit output."""
+
+    gcmvn_stats: Optional[PathLike] = None
+    """the stats for gcmvn, used by Prosody Encoder"""
+
+
+def count_lines(filename: Path) -> int:
+    result = subprocess.run(["wc", "-l", filename], stdout=subprocess.PIPE)
+    return int(result.stdout.decode().split()[0])
+
+
+def build_data_pipeline(
+    ctx: EvalContext,
+    text_tokenizer: TextTokenizer,
+) -> DataPipeline:
+    with open(ctx.data_file, "r") as f:
+        header = f.readline().strip("\n").split("\t")
+
+    # TODO: This will be soon auto-tuned. Right now hand-tuned for devfair.
+    n_parallel = 4
+
+    split_tsv = StrSplitter(names=header)
+
+    if ctx.gcmvn_stats is not None:
+        if isinstance(ctx.gcmvn_stats, CString):
+            ctx.gcmvn_stats = str(ctx.gcmvn_stats)
+        gcmvn_stats: Dict[str, np.ndarray] = np.load(ctx.gcmvn_stats)  # type: ignore[type-arg]
+        gcmvn_mean = torch.tensor(
+            gcmvn_stats["mean"], device=ctx.device, dtype=ctx.dtype
+        )
+        gcmvn_std = torch.tensor(gcmvn_stats["std"], device=ctx.device, dtype=ctx.dtype)
+
+    pipeline_builder = read_text(ctx.data_file, rtrim=True).skip(1).map(split_tsv)
+
+    assert ctx.audio_root_dir is not None
+
+    map_file = FileMapper(root_dir=ctx.audio_root_dir, cached_fd_count=10)
+
+    pipeline_builder.map(map_file, selector="audio", num_parallel_calls=n_parallel)
+
+    decode_audio = AudioDecoder(dtype=torch.float32, device=ctx.device)
+
+    convert_to_fbank = WaveformToFbankConverter(
+        num_mel_bins=80,
+        waveform_scale=2**15,
+        channel_last=True,
+        standardize=False,
+        device=ctx.device,
+        dtype=ctx.dtype,
+    )
+
+    def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput:
+        fbank = data["fbank"]
+        std, mean = torch.std_mean(fbank, dim=0)
+        data["fbank"] = fbank.subtract(mean).divide(std)
+        if ctx.gcmvn_stats is not None:
+            data["gcmvn_fbank"] = fbank.subtract(gcmvn_mean).divide(gcmvn_std)
+        return data
+
+    pipeline_builder.map(
+        [decode_audio, convert_to_fbank, normalize_fbank],
+        selector="audio.data",
+        num_parallel_calls=n_parallel,
+    )
+
+    pipeline_builder.bucket(bucket_size=ctx.batch_size)
+
+    collate = Collater(pad_value=0, pad_to_multiple=1)
+
+    pipeline_builder.map(collate, num_parallel_calls=n_parallel)
+
+    pipeline_builder.prefetch(4)
+
+    return pipeline_builder.and_return()
+
+
+def adjust_output_for_corrupted_inputs(
+    valid_sequences: Tensor,
+    text_output: List[StringLike],
+    speech_output: Optional[BatchedSpeechOutput],
+) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]:
+    adjusted_text_output: List[StringLike] = []
+    adjusted_speech_output: Optional[BatchedSpeechOutput] = None
+
+    if speech_output is not None:
+        assert (
+            len(text_output)
+            == len(speech_output.units)
+            == len(speech_output.audio_wavs)
+        )
+        adjusted_speech_output = BatchedSpeechOutput(units=[], audio_wavs=[])
+
+    batch_counter = 0
+    for is_valid in valid_sequences:
+        if is_valid:
+            adjusted_text_output.append(text_output[batch_counter])
+            if speech_output is not None:
+                assert adjusted_speech_output is not None
+                adjusted_speech_output.units.append(speech_output.units[batch_counter])
+                adjusted_speech_output.audio_wavs.append(
+                    speech_output.audio_wavs[batch_counter]
+                )
+            batch_counter += 1
+        else:
+            # For the corrupted inputs, we save the following dummy outputs:
+            # empty string for text, empty list for units, 1 second of silence for audio.
+            adjusted_text_output.append("")
+            if adjusted_speech_output is not None:
+                sample_rate = adjusted_speech_output.sample_rate
+                adjusted_speech_output.units.append([])
+                adjusted_speech_output.audio_wavs.append(
+                    torch.zeros(sample_rate).unsqueeze(0).unsqueeze(0)
+                )
+    return (
+        adjusted_text_output,
+        adjusted_speech_output,
+    )
+
+
+def run_eval(
+    translator: Translator, text_tokenizer: TextTokenizer, ctx: EvalContext
+) -> None:
+    pipeline = build_data_pipeline(ctx, text_tokenizer)
+
+    total_steps = count_lines(ctx.data_file) - 1
+    progress_bar = tqdm(total=total_steps)
+
+    output_path = ctx.output_path / ctx.data_file.stem
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    if ctx.output_modality == Modality.SPEECH:
+        waveforms_dir = output_path / f"waveform_{ctx.data_file.stem}"
+        waveforms_dir.mkdir(parents=True, exist_ok=True)
+
+    hyps = []
+    refs = []
+
+    with contextlib.ExitStack() as stack:
+        hyp_file = stack.enter_context(
+            open(output_path / f"text_output-{ctx.data_file.stem}.txt", "w")
+        )
+        if ctx.output_modality == Modality.SPEECH:
+            unit_file = stack.enter_context(
+                open(output_path / f"unit_output-{ctx.data_file.stem}.txt", "w")
+            )
+
+        sample_id = 0
+        for example in pipeline:
+            valid_sequences: Optional[Tensor] = None
+            src = example["audio"]["data"]["fbank"]
+            # Skip corrupted audio tensors.
+            valid_sequences = ~torch.any(
+                torch.any(torch.isnan(src["seqs"]), dim=1), dim=1
+            )
+            if not valid_sequences.all():
+                logger.warning(
+                    f"Sample IDs {sample_id} to {sample_id + ctx.batch_size} has some corrupted input."
+                )
+                src["seqs"] = src["seqs"][valid_sequences]
+                src["seq_lens"] = src["seq_lens"][valid_sequences]
+
+            # Skip performing inference when the input is entirely corrupted.
+            if src["seqs"].numel() > 0:
+                (
+                    text_output,
+                    speech_output,
+                ) = translator.predict(
+                    src,
+                    ctx.task,
+                    ctx.target_lang,
+                    src_lang=ctx.source_lang,
+                    text_generation_opts=ctx.text_generation_opts,
+                    unit_generation_opts=ctx.unit_generation_opts,
+                    unit_generation_ngram_filtering=ctx.unit_generation_ngram_filtering,
+                    gcmvn_fbank=example["audio"]["data"].get("gcmvn_fbank", None),
+                )
+            else:
+                text_output = []
+                if ctx.output_modality == Modality.SPEECH:
+                    speech_output = BatchedSpeechOutput(units=[], audio_wavs=[])
+                else:
+                    speech_output = None
+
+            if valid_sequences is not None and not valid_sequences.all():
+                (
+                    text_output,
+                    speech_output,
+                ) = adjust_output_for_corrupted_inputs(
+                    valid_sequences,
+                    text_output,
+                    speech_output,
+                )
+
+            hyps += [str(s) for s in text_output]
+            refs += [str(s) for s in example[ctx.ref_field]]
+
+            for i in range(len(text_output)):
+                t = text_output[i]
+                hyp_file.write(f"{t}\n")
+
+                if ctx.output_modality == Modality.SPEECH:
+                    assert speech_output is not None
+                    u = speech_output.units[i]
+                    str_units = [str(i) for i in u]
+                    unit_file.write(" ".join(str_units) + "\n")
+                    torchaudio.save(
+                        waveforms_dir / f"{sample_id}_pred.wav",
+                        speech_output.audio_wavs[i][0].to(torch.float32).cpu(),
+                        sample_rate=speech_output.sample_rate,
+                    )
+
+                sample_id += 1
+                progress_bar.update(1)
+
+    progress_bar.close()
+    logger.info(f"Processed {len(hyps)} hyps, {len(refs)} refs")
+
+    assert len(hyps) == len(refs)
+    if len(hyps) > 0:
+        if ctx.target_lang in ("cmn", "jpn", "lao", "mya", "tha"):
+            tokenizer = "char"
+        else:
+            tokenizer = "13a"
+
+        bleu = BLEU(tokenize=tokenizer)
+        score = bleu.corpus_score(hyps, [refs])
+        bleu_filename = output_path / f"{ctx.data_file.stem}_text_output_bleu.json"
+        with open(bleu_filename, "w") as f:
+            f.write(score.format(signature=str(bleu.get_signature()), is_json=True))
+        logger.info(score.format(signature=bleu.get_signature()))
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Expressivity evaluation for tasks supported by Translator."
+    )
+    parser.add_argument("data_file", type=str, help="Data file (.tsv) to be evaluated.")
+
+    parser = add_inference_arguments(parser)
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        help="Inference batch size.",
+        default=4,
+    )
+    parser.add_argument(
+        "--audio_root_dir",
+        type=str,
+        help="Root directory for the audio filenames in the data file.",
+        default="",
+    )
+    parser.add_argument(
+        "--ref_field",
+        type=str,
+        help="Reference target text field to compute the BLEU score against.",
+        default="tgt_text",
+    )
+    parser.add_argument(
+        "--gcmvn_stats",
+        type=str,
+        help="The path to gcmvn fbank stats, if provided, the DataPipeline'd have another copy of gcmvn fbank features (for P2V enc)",
+        default=None,
+    )
+    args = parser.parse_args()
+
+    input_modality, output_modality = Translator.get_modalities_from_task_str(args.task)
+
+    if input_modality == Modality.SPEECH and not Path(args.audio_root_dir).exists():
+        raise ValueError(
+            f"Invalid audio_root_dir: {args.audio_root_dir} for speech input."
+        )
+
+    if torch.cuda.is_available():
+        device = torch.device("cuda:0")
+        dtype = torch.float32
+    else:
+        device = torch.device("cpu")
+        dtype = torch.float32
+
+    text_tokenizer = load_unity_text_tokenizer(args.model_name)
+
+    # TODO: Avoid loading the T2U model, vocoder when the output
+    # modality is text.
+    translator = Translator(
+        args.model_name,
+        args.vocoder_name,
+        device,
+        text_tokenizer=text_tokenizer,
+        dtype=dtype,
+    )
+
+    text_generation_opts, unit_generation_opts = set_generation_opts(args)
+
+    logger.info(f"{text_generation_opts=}")
+    logger.info(f"{unit_generation_opts=}")
+    logger.info(
+        f"unit_generation_ngram_filtering={args.unit_generation_ngram_filtering}"
+    )
+
+    # fmt: off
+    ctx = EvalContext(
+        task=args.task,
+        output_modality=output_modality,
+        model_name=args.model_name,
+        data_file=Path(args.data_file),
+        audio_root_dir=Path(args.audio_root_dir),
+        target_lang=args.tgt_lang,
+        source_lang=args.src_lang,
+        batch_size=args.batch_size,
+        device=device,
+        dtype=dtype,
+        ref_field=args.ref_field,
+        text_generation_opts=text_generation_opts,
+        unit_generation_opts=unit_generation_opts,
+        unit_generation_ngram_filtering=args.unit_generation_ngram_filtering,
+        output_path=Path(args.output_path),
+        gcmvn_stats=args.gcmvn_stats,
+    )
+    # fmt: on
+    logger.info(f"Running inference on {device=} with {dtype=}, {ctx.batch_size=}.")
+
+    run_eval(translator, text_tokenizer, ctx)
+
+
+if __name__ == "__main__":
+    main()

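The key data-pipeline change above is the optional second copy of the fbank features: every utterance is normalized with its own mean/std, and when --gcmvn_stats is passed the raw fbank is additionally normalized with global mean/std for the prosody encoder. A minimal sketch of that double normalization, mirroring normalize_fbank; the stats file name and its .npz layout are assumptions.

# Sketch of the two normalizations in normalize_fbank above: per-utterance CMVN
# for the speech encoder input, global CMVN (gcmvn) for the prosody encoder input.
import numpy as np
import torch

fbank = torch.randn(200, 80)  # (time, mel bins); stands in for a decoded utterance

std, mean = torch.std_mean(fbank, dim=0)
utterance_fbank = fbank.subtract(mean).divide(std)          # the "fbank" field

stats = np.load("gcmvn_stats.npz")                          # assumed: "mean"/"std" arrays of size 80
gcmvn_mean = torch.tensor(stats["mean"], dtype=fbank.dtype)
gcmvn_std = torch.tensor(stats["std"], dtype=fbank.dtype)
gcmvn_fbank = fbank.subtract(gcmvn_mean).divide(gcmvn_std)  # the "gcmvn_fbank" field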
+ 8 - 2
src/seamless_communication/cli/m4t/evaluate/evaluate.py

@@ -267,7 +267,10 @@ def run_eval(
 
             # Skip performing inference when the input is entirely corrupted.
             if src["seqs"].numel() > 0:
-                (text_output, speech_output,) = translator.predict(
+                (
+                    text_output,
+                    speech_output,
+                ) = translator.predict(
                     src,
                     ctx.task,
                     ctx.target_lang,
@@ -284,7 +287,10 @@ def run_eval(
                     speech_output = None
 
             if valid_sequences is not None and not valid_sequences.all():
-                (text_output, speech_output,) = adjust_output_for_corrupted_inputs(
+                (
+                    text_output,
+                    speech_output,
+                ) = adjust_output_for_corrupted_inputs(
                     valid_sequences,
                     text_output,
                     speech_output,

+ 10 - 2
src/seamless_communication/inference/generator.py

@@ -153,6 +153,7 @@ class UnitYGenerator:
         input_modality: str = "speech",
         output_modality: str = "speech",
         ngram_filtering: bool = False,
+        gcmvn_seqs: Optional[Tensor] = None,
     ) -> Tuple[SequenceToTextOutput, Optional["SequenceToUnitOutput"]]:
         """
         :param source_seqs:
@@ -215,6 +216,12 @@ class UnitYGenerator:
         assert self.unit_decoder is not None
 
         unit_gen_output = None
+        prosody_encoder_out = None
+        if self.model.prosody_encoder_model is not None:
+            prosody_encoder_out = self.model.prosody_encoder_model(
+                gcmvn_seqs, source_padding_mask
+            ).unsqueeze(1)
+
         if isinstance(self.model.t2u_model, UnitYT2UModel):
             assert self.unit_generator is not None
             t2u_encoder_output, t2u_encoder_padding_mask = self.model.t2u_model.encode(
@@ -231,6 +238,7 @@ class UnitYGenerator:
                 text_decoder_output=decoder_output,
                 text_decoder_padding_mask=decoder_padding_mask,
                 text_seqs=text_seqs,
+                film_cond_emb=prosody_encoder_out,
             )
             # (B, S_unit, V_unit)
             unit_seqs = unit_decoder_output.logits.argmax(dim=2)
@@ -243,8 +251,8 @@ class UnitYGenerator:
         units = self.unit_decoder(unit_seqs)
 
         if ngram_filtering:
-            units = remove_consecutive_repeated_ngrams(units.cpu().numpy().tolist())
-            units = torch.tensor(units)
+            arr = remove_consecutive_repeated_ngrams(units.cpu().numpy().tolist())
+            units = torch.tensor(arr)
 
         unit_output = SequenceToUnitOutput(units, unit_gen_output)
 

+ 11 - 4
src/seamless_communication/inference/translator.py

@@ -7,7 +7,7 @@ import logging
 from dataclasses import dataclass
 from enum import Enum, auto
 from pathlib import Path
-from typing import Any, Dict, Callable, List, Optional, Tuple, Union, cast
+from typing import Callable, List, Optional, Tuple, Union, cast
 
 import torch
 import torch.nn as nn
@@ -144,6 +144,7 @@ class Translator(nn.Module):
         text_generation_opts: SequenceGeneratorOptions,
         unit_generation_opts: Optional[SequenceGeneratorOptions],
         unit_generation_ngram_filtering: bool = False,
+        gcmvn_fbank: Optional[SequenceData] = None,
     ) -> Tuple[SequenceToTextOutput, Optional[SequenceToUnitOutput]]:
         # We disregard unit generations opts for the NAR T2U decoder.
         if output_modality != Modality.SPEECH or isinstance(
@@ -160,12 +161,18 @@ class Translator(nn.Module):
             unit_opts=unit_generation_opts,
         )
         seqs, padding_mask = get_seqs_and_padding_mask(src)
+        if gcmvn_fbank is not None:
+            gcmvn_seqs = gcmvn_fbank["seqs"]
+        else:
+            gcmvn_seqs = None
+
         return generator(
             seqs,
             padding_mask,
             input_modality.value,
             output_modality.value,
             ngram_filtering=unit_generation_ngram_filtering,
+            gcmvn_seqs=gcmvn_seqs,
         )
 
     @staticmethod
@@ -188,7 +195,7 @@ class Translator(nn.Module):
     @torch.inference_mode()
     def predict(
         self,
-        input: Union[str, Tensor, Dict[str, Any]],
+        input: Union[str, Tensor, SequenceData],
         task_str: str,
         tgt_lang: str,
         src_lang: Optional[str] = None,
@@ -201,6 +208,7 @@ class Translator(nn.Module):
         spkr: Optional[int] = -1,
         sample_rate: int = 16000,
         unit_generation_ngram_filtering: bool = False,
+        gcmvn_fbank: Optional[SequenceData] = None,
     ) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]:
         """
         The main method used to perform inference on all tasks.
@@ -231,8 +239,6 @@ class Translator(nn.Module):
         input_modality, output_modality = self.get_modalities_from_task_str(task_str)
 
         if isinstance(input, dict):
-            assert "seqs" in input
-            assert "seq_lens" in input
             src = cast(SequenceData, input)
         elif input_modality == Modality.SPEECH:
             audio = input
@@ -282,6 +288,7 @@ class Translator(nn.Module):
             text_generation_opts,
             unit_generation_opts,
             unit_generation_ngram_filtering=unit_generation_ngram_filtering,
+            gcmvn_fbank=gcmvn_fbank,
         )
 
         if output_modality == Modality.TEXT:

+ 16 - 0
src/seamless_communication/models/pretssel/__init__.py

@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN as ECAPA_TDNN
+from seamless_communication.models.pretssel.ecapa_tdnn_builder import (
+    EcapaTDNNBuilder as EcapaTDNNBuilder,
+)
+from seamless_communication.models.pretssel.ecapa_tdnn_builder import (
+    EcapaTDNNConfig as EcapaTDNNConfig,
+)
+from seamless_communication.models.pretssel.ecapa_tdnn_builder import (
+    ecapa_tdnn_archs as ecapa_tdnn_archs,
+)

+ 477 - 0
src/seamless_communication/models/pretssel/ecapa_tdnn.py

@@ -0,0 +1,477 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from fairseq2.nn.padding import PaddingMask, to_padding_mask
+from torch import Tensor
+from torch.nn import Conv1d, LayerNorm, Module, ModuleList, ReLU, Sigmoid, Tanh, init
+
+
+class ECAPA_TDNN(Module):
+    """
+    Represents the ECAPA-TDNN model described in
+    :cite:t:`https://doi.org/10.48550/arxiv.2005.07143`.
+
+    Arguments
+    ---------
+    :param channels:
+        Output channels for TDNN/SERes2Net layer.
+    :param kernel_sizes:
+        List of kernel sizes for each layer.
+    :param dilations:
+        List of dilations for kernels in each layer.
+    :param attention_channels:
+        Number of channels in the attentive statistics pooling.
+    :param res2net_scale:
+        Scale of the Res2Net blocks.
+    :param se_channels:
+        Number of squeeze-and-excitation channels.
+    :param global_context:
+        If ``True``, uses global utterance context in the pooling layer.
+    :param groups:
+        List of groups for kernels in each layer.
+    :param embed_dim:
+        Dimensionality of the output embedding.
+    :param input_dim:
+        Dimensionality of the input features (number of mel bins).
+    """
+
+    def __init__(
+        self,
+        channels: List[int],
+        kernel_sizes: List[int],
+        dilations: List[int],
+        attention_channels: int,
+        res2net_scale: int,
+        se_channels: int,
+        global_context: bool,
+        groups: List[int],
+        embed_dim: int,
+        input_dim: int,
+    ):
+        super().__init__()
+        assert len(channels) == len(kernel_sizes) == len(dilations)
+        self.channels = channels
+        self.embed_dim = embed_dim
+        self.blocks = ModuleList()
+
+        self.blocks.append(
+            TDNNBlock(
+                input_dim,
+                channels[0],
+                kernel_sizes[0],
+                dilations[0],
+                groups[0],
+            )
+        )
+
+        # SE-Res2Net layers
+        for i in range(1, len(channels) - 1):
+            self.blocks.append(
+                SERes2NetBlock(
+                    channels[i - 1],
+                    channels[i],
+                    res2net_scale=res2net_scale,
+                    se_channels=se_channels,
+                    kernel_size=kernel_sizes[i],
+                    dilation=dilations[i],
+                    groups=groups[i],
+                )
+            )
+
+        # Multi-layer feature aggregation
+        self.mfa = TDNNBlock(
+            channels[-1],
+            channels[-1],
+            kernel_sizes[-1],
+            dilations[-1],
+            groups=groups[-1],
+        )
+
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            channels[-1],
+            attention_channels=attention_channels,
+            global_context=global_context,
+        )
+        self.asp_norm = LayerNorm(channels[-1] * 2, eps=1e-12)
+
+        # Final linear transformation
+        self.fc = Conv1d(
+            in_channels=channels[-1] * 2,
+            out_channels=embed_dim,
+            kernel_size=1,
+        )
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """Reset the parameters and buffers of the module."""
+
+        def encoder_init(m: Module) -> None:
+            if isinstance(m, Conv1d):
+                init.xavier_uniform_(m.weight, init.calculate_gain("relu"))
+
+        self.apply(encoder_init)
+
+    def forward(
+        self,
+        x: Tensor,
+        padding_mask: Optional[PaddingMask] = None,
+    ) -> Tensor:
+        """Returns the embedding vector.
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape (batch, time, channel).
+        """
+        # Minimize transpose for efficiency
+        x = x.transpose(1, 2)
+
+        xl = []
+        for layer in self.blocks:
+            x = layer(x, padding_mask=padding_mask)
+            xl.append(x)
+
+        # Multi-layer feature aggregation
+        x = torch.cat(xl[1:], dim=1)
+        x = self.mfa(x)
+
+        # Attentive Statistical Pooling
+        x = self.asp(x, padding_mask=padding_mask)
+        x = self.asp_norm(x.transpose(1, 2)).transpose(1, 2)
+
+        # Final linear transformation
+        x = self.fc(x)
+
+        x = x.transpose(1, 2).squeeze(1)  # B x C
+        return F.normalize(x, dim=-1)
+
+
+class TDNNBlock(Module):
+    """An implementation of TDNN.
+
+    Arguments
+    ----------
+    :param in_channels : int
+        Number of input channels.
+    :param out_channels : int
+        The number of output channels.
+    :param kernel_size : int
+        The kernel size of the TDNN blocks.
+    :param dilation : int
+        The dilation of the TDNN block.
+    :param groups: int
+        The groups size of the TDNN blocks.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        dilation: int,
+        groups: int = 1,
+    ):
+        super().__init__()
+        self.conv = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=dilation * (kernel_size - 1) // 2,
+            groups=groups,
+        )
+        self.activation = ReLU()
+        self.norm = LayerNorm(out_channels, eps=1e-12)
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        x = self.activation(self.conv(x))
+
+        return self.norm(x.transpose(1, 2)).transpose(1, 2)  # type: ignore[no-any-return]
+
+
+class Res2NetBlock(Module):
+    """An implementation of Res2NetBlock w/ dilation.
+
+    Arguments
+    ---------
+    :param in_channels : int
+        The number of channels expected in the input.
+    :param out_channels : int
+        The number of output channels.
+    :param scale : int
+        The scale of the Res2Net block.
+    :param kernel_size: int
+        The kernel size of the Res2Net block.
+    :param dilation : int
+        The dilation of the Res2Net block.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        scale: int = 8,
+        kernel_size: int = 3,
+        dilation: int = 1,
+    ):
+        super().__init__()
+        assert in_channels % scale == 0
+        assert out_channels % scale == 0
+
+        in_channel = in_channels // scale
+        hidden_channel = out_channels // scale
+        self.blocks = ModuleList(
+            [
+                TDNNBlock(
+                    in_channel,
+                    hidden_channel,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                )
+                for i in range(scale - 1)
+            ]
+        )
+        self.scale = scale
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        y = []
+        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
+            if i == 0:
+                y_i = x_i
+            elif i == 1:
+                y_i = self.blocks[i - 1](x_i)
+            else:
+                y_i = self.blocks[i - 1](x_i + y_i)
+            y.append(y_i)
+
+        y_tensor = torch.cat(y, dim=1)
+        return y_tensor
+
+
+class SEBlock(Module):
+    """An implementation of squeeze-and-excitation block.
+
+    Arguments
+    ---------
+    in_channels : int
+        The number of input channels.
+    se_channels : int
+        The number of output channels after squeeze.
+    out_channels : int
+        The number of output channels.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        se_channels: int,
+        out_channels: int,
+    ):
+        super().__init__()
+
+        self.conv1 = Conv1d(
+            in_channels=in_channels, out_channels=se_channels, kernel_size=1
+        )
+        self.relu = ReLU(inplace=True)
+        self.conv2 = Conv1d(
+            in_channels=se_channels, out_channels=out_channels, kernel_size=1
+        )
+        self.sigmoid = Sigmoid()
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        if padding_mask is not None:
+            mask = padding_mask.materialize().unsqueeze(1)
+            s = (x * mask).sum(dim=2, keepdim=True) / padding_mask.seq_lens[
+                :, None, None
+            ]
+        else:
+            s = x.mean(dim=2, keepdim=True)
+
+        s = self.relu(self.conv1(s))
+        s = self.sigmoid(self.conv2(s))
+
+        return s * x
+
+
+class AttentiveStatisticsPooling(Module):
+    """This class implements an attentive statistic pooling layer for each channel.
+    It returns the concatenated mean and std of the input tensor.
+
+    Arguments
+    ---------
+    channels: int
+        The number of input channels.
+    attention_channels: int
+        The number of attention channels.
+    global_context: bool
+        If ``True``, concatenates the utterance-level mean and std to the
+        attention input.
+    """
+
+    def __init__(
+        self, channels: int, attention_channels: int = 128, global_context: bool = True
+    ):
+        super().__init__()
+
+        self.eps = 1e-12
+        self.global_context = global_context
+        if global_context:
+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
+        else:
+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
+
+        self.tanh = Tanh()
+        self.conv = Conv1d(
+            in_channels=attention_channels, out_channels=channels, kernel_size=1
+        )
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Calculates mean and std for a batch (input tensor).
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape [N, C, L].
+        """
+        L = x.shape[-1]
+
+        def _compute_statistics(
+            x: Tensor, m: Tensor, dim: int = 2, eps: float = self.eps
+        ) -> Tuple[Tensor, Tensor]:
+            mean = (m * x).sum(dim)
+            std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
+            return mean, std
+
+        # if lengths is None:
+        #     lengths = [x.shape[0]]
+
+        # Make binary mask of shape [N, 1, L]
+        # mask = to_padding_mask(lengths, max(lengths))
+        if padding_mask is not None:
+            mask = padding_mask.materialize()
+        else:
+            mask = to_padding_mask(torch.IntTensor([L]), L).repeat(x.shape[0], 1).to(x)
+        mask = mask.unsqueeze(1)
+
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
+        if self.global_context:
+            # torch.std is unstable for backward computation
+            # https://github.com/pytorch/pytorch/issues/4320
+            total = mask.sum(dim=2, keepdim=True).to(x)
+            mean, std = _compute_statistics(x, mask / total)
+            mean = mean.unsqueeze(2).repeat(1, 1, L)
+            std = std.unsqueeze(2).repeat(1, 1, L)
+            attn = torch.cat([x, mean, std], dim=1)
+        else:
+            attn = x
+
+        # Apply layers
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+
+        # Filter out zero-paddings
+        attn = attn.masked_fill(mask == 0, float("-inf"))
+
+        attn = F.softmax(attn, dim=2)
+        mean, std = _compute_statistics(x, attn)
+        # Append mean and std of the batch
+        pooled_stats = torch.cat((mean, std), dim=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+
+        return pooled_stats
+
+
+class SERes2NetBlock(Module):
+    """An implementation of building block in ECAPA-TDNN, i.e.,
+    TDNN-Res2Net-TDNN-SEBlock.
+
+    Arguments
+    ----------
+    in_channels: int
+        The number of input channels.
+    out_channels: int
+        The number of output channels.
+    res2net_scale: int
+        The scale of the Res2Net block.
+    se_channels: int
+        The number of squeeze-and-excitation channels.
+    kernel_size: int
+        The kernel size of the TDNN blocks.
+    dilation: int
+        The dilation of the Res2Net block.
+    groups: int
+        Number of blocked connections from input channels to output channels.
+
+    Example
+    -------
+    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
+    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
+    >>> out = conv(x).transpose(1, 2)
+    >>> out.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        res2net_scale: int = 8,
+        se_channels: int = 128,
+        kernel_size: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TDNNBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            groups=groups,
+        )
+        self.res2net_block = Res2NetBlock(
+            out_channels,
+            out_channels,
+            res2net_scale,
+            kernel_size,
+            dilation,
+        )
+        self.tdnn2 = TDNNBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            groups=groups,
+        )
+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+
+    def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor:
+        """Processes the input tensor x and returns an output tensor."""
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.tdnn1(x)
+        x = self.res2net_block(x)
+        x = self.tdnn2(x)
+        x = self.se_block(x, padding_mask=padding_mask)
+
+        return x + residual

+ 112 - 0
src/seamless_communication/models/pretssel/ecapa_tdnn_builder.py

@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.typing import DataType, Device
+
+from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN
+
+
+@dataclass
+class EcapaTDNNConfig:
+    channels: List[int]
+    kernel_sizes: List[int]
+    dilations: List[int]
+    attention_channels: int
+    res2net_scale: int
+    se_channels: int
+    global_context: bool
+    groups: List[int]
+    embed_dim: int
+    input_dim: int
+
+
+ecapa_tdnn_archs = ArchitectureRegistry[EcapaTDNNConfig]("ecapa_tdnn")
+
+ecapa_tdnn_arch = ecapa_tdnn_archs.marker
+
+
+@ecapa_tdnn_arch("base")
+def _base_ecapa_tdnn() -> EcapaTDNNConfig:
+    return EcapaTDNNConfig(
+        channels=[512, 512, 512, 512, 1536],
+        kernel_sizes=[5, 3, 3, 3, 1],
+        dilations=[1, 2, 3, 4, 1],
+        attention_channels=128,
+        res2net_scale=8,
+        se_channels=128,
+        global_context=True,
+        groups=[1, 1, 1, 1, 1],
+        embed_dim=512,
+        input_dim=80,
+    )
+
+
+class EcapaTDNNBuilder:
+    """
+    Builder module for ECAPA_TDNN model
+    """
+
+    config: EcapaTDNNConfig
+    device: Optional[Device]
+    dtype: Optional[DataType]
+
+    def __init__(
+        self,
+        config: EcapaTDNNConfig,
+        *,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ) -> None:
+        """
+        :param config:
+            The configuration to use.
+        :param device:
+            The device on which to initialize modules.
+        :param dtype:
+            The data type of module parameters and buffers.
+        """
+        self.config = config
+
+        self.device, self.dtype = device, dtype
+
+    def build_model(self) -> ECAPA_TDNN:
+        """Build a model."""
+        model = ECAPA_TDNN(
+            self.config.channels,
+            self.config.kernel_sizes,
+            self.config.dilations,
+            self.config.attention_channels,
+            self.config.res2net_scale,
+            self.config.se_channels,
+            self.config.global_context,
+            self.config.groups,
+            self.config.embed_dim,
+            self.config.input_dim,
+        )
+        model.to(device=self.device, dtype=self.dtype)
+        return model
+
+
+def create_ecapa_tdnn_model(
+    config: EcapaTDNNConfig,
+    device: Optional[Device] = None,
+    dtype: Optional[DataType] = None,
+) -> ECAPA_TDNN:
+    """Create a ECAPA_TDNN model.
+
+    :param config:
+        The configuration to use.
+    :param device:
+        The device on which to initialize modules.
+    :param dtype:
+        The data type of module parameters and buffers.
+    """
+
+    return EcapaTDNNBuilder(config, device=device, dtype=dtype).build_model()

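Taken together with ecapa_tdnn.py above, this gives a registry-driven way to build the prosody encoder. A small sketch, using only names defined in this PR, that builds the "base" configuration and embeds a batch of fbank features (80 mel bins in, 512-dim embedding out):

# Sketch: build the "base" prosody encoder and embed a batch of fbank features.
import torch

from seamless_communication.models.pretssel.ecapa_tdnn_builder import (
    create_ecapa_tdnn_model,
    ecapa_tdnn_archs,
)

config = ecapa_tdnn_archs.get_config("base")
encoder = create_ecapa_tdnn_model(config, device=torch.device("cpu"), dtype=torch.float32)

fbank = torch.randn(2, 120, 80)   # (batch, time, mel bins)
embedding = encoder(fbank)        # padding_mask=None, i.e. no padded frames
print(embedding.shape)            # expected: torch.Size([2, 512])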
+ 1 - 0
src/seamless_communication/models/unity/__init__.py

@@ -20,6 +20,7 @@ from seamless_communication.models.unity.char_tokenizer import (
 from seamless_communication.models.unity.char_tokenizer import (
     load_unity_char_tokenizer as load_unity_char_tokenizer,
 )
+from seamless_communication.models.unity.film import FiLM
 from seamless_communication.models.unity.length_regulator import (
     HardUpsampling as HardUpsampling,
 )

+ 96 - 4
src/seamless_communication/models/unity/builder.py

@@ -14,15 +14,23 @@ from fairseq2.models.w2vbert import w2vbert_archs
 from fairseq2.models.wav2vec2 import Wav2Vec2EncoderBuilder, Wav2Vec2EncoderConfig
 from fairseq2.nn.projection import TiedProjection
 from fairseq2.nn.transformer import (
+    FeedForwardNetwork,
     MultiheadAttention,
     StandardFeedForwardNetwork,
     StandardMultiheadAttention,
     TransformerEncoder,
     TransformerEncoderLayer,
+    TransformerNormOrder,
     create_default_sdpa,
 )
-from fairseq2.typing import DataType, Device
+from fairseq2.typing import DataType, Device, override
+from torch.nn import GELU, ReLU
 
+from seamless_communication.models.pretssel import (
+    EcapaTDNNBuilder,
+    EcapaTDNNConfig,
+    ecapa_tdnn_archs,
+)
 from seamless_communication.models.unity.adaptor_block import (
     UnitYConformerAdaptorLayer,
     UnitYEncoderAdaptor,
@@ -59,12 +67,19 @@ class UnitYConfig:
     t2u_config: Optional[UnitYT2UConfig]
     """The configuration of the UnitY T2U sub-model."""
 
+    prosody_encoder_config: Optional[EcapaTDNNConfig]
+    """The configuration of the expressive prosody encoder."""
+
     use_text_encoder: bool
     """If ``True``, uses an aligned MT encoder for the MT task."""
 
     use_conformer_adaptor: bool
     """If ``True``, uses a Conformer-based adaptor block."""
 
+    use_gelu: bool
+    """If ``True``, uses GELU activation function in feed-forward networks of
+    adaptor blocks and decoder layers."""
+
     num_adaptor_layers: int
     """The number of Transformer encoder layers in the adaptor block."""
 
@@ -103,8 +118,10 @@ def _base() -> UnitYConfig:
         w2v2_encoder_config=w2vbert_config.w2v2_config.encoder_config,
         mt_model_config=mt_model_config,
         t2u_config=t2u_config,
+        prosody_encoder_config=None,
         use_text_encoder=True,
         use_conformer_adaptor=False,
+        use_gelu=False,
         num_adaptor_layers=1,
         adaptor_kernel_size=8,
         adaptor_stride=8,
@@ -128,8 +145,10 @@ def _medium() -> UnitYConfig:
         w2v2_encoder_config=w2vbert_config.w2v2_config.encoder_config,
         mt_model_config=mt_model_config,
         t2u_config=t2u_config,
+        prosody_encoder_config=None,
         use_text_encoder=True,
         use_conformer_adaptor=False,
+        use_gelu=False,
         num_adaptor_layers=1,
         adaptor_kernel_size=8,
         adaptor_stride=8,
@@ -155,8 +174,43 @@ def _base_v2() -> UnitYConfig:
         w2v2_encoder_config=w2v2_chunk_encoder_config,
         mt_model_config=mt_model_config,
         t2u_config=t2u_config,
+        prosody_encoder_config=None,
         use_text_encoder=True,
         use_conformer_adaptor=False,
+        use_gelu=False,
+        num_adaptor_layers=1,
+        adaptor_kernel_size=8,
+        adaptor_stride=8,
+        adaptor_layer_norm=True,
+        adaptor_dropout_p=0.1,
+    )
+
+
+@unity_arch("expressivity_v2")
+def _expressivity_v2() -> UnitYConfig:
+    w2v2_chunk_encoder_config = wav2vec2_chunk_archs.get_config("600m")
+
+    mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b")
+
+    mt_model_config.vocab_info.size = 256102  # NLLB-100
+
+    mt_model_config.vocab_info.pad_idx = 1
+
+    mt_model_config.max_seq_len = 4000
+
+    t2u_config = unity_t2u_archs.get_config("expressivity_nar")
+
+    prosody_encoder_config = ecapa_tdnn_archs.get_config("base")
+
+    return UnitYConfig(
+        model_dim=1024,
+        w2v2_encoder_config=w2v2_chunk_encoder_config,
+        mt_model_config=mt_model_config,
+        t2u_config=t2u_config,
+        prosody_encoder_config=prosody_encoder_config,
+        use_text_encoder=False,
+        use_conformer_adaptor=False,
+        use_gelu=True,
         num_adaptor_layers=1,
         adaptor_kernel_size=8,
         adaptor_stride=8,
@@ -176,6 +230,7 @@ class UnitYBuilder:
     w2v2_encoder_builder: Wav2Vec2EncoderBuilder
     mt_model_builder: NllbBuilder
     t2u_builder: Union[UnitYT2UBuilder, UnitYNART2UBuilder, None]
+    prosody_encoder_builder: Optional[EcapaTDNNBuilder]
     device: Optional[Device]
     dtype: Optional[DataType]
 
@@ -185,6 +240,7 @@ class UnitYBuilder:
         w2v2_encoder_builder: Wav2Vec2EncoderBuilder,
         mt_model_builder: NllbBuilder,
         t2u_builder: Union[UnitYT2UBuilder, UnitYNART2UBuilder, None],
+        prosody_encoder_builder: Optional[EcapaTDNNBuilder],
         *,
         device: Optional[Device] = None,
         dtype: Optional[DataType] = None,
@@ -223,6 +279,7 @@ class UnitYBuilder:
         self.w2v2_encoder_builder = w2v2_encoder_builder
         self.mt_model_builder = mt_model_builder
         self.t2u_builder = t2u_builder
+        self.prosody_encoder_builder = prosody_encoder_builder
 
         self.device, self.dtype = device, dtype
 
@@ -251,6 +308,11 @@ class UnitYBuilder:
         else:
             t2u_model = self.t2u_builder.build_model()
 
+        if self.prosody_encoder_builder is None:
+            prosody_encoder_model = None
+        else:
+            prosody_encoder_model = self.prosody_encoder_builder.build_model()
+
         return UnitYModel(
             speech_encoder_frontend,
             speech_encoder,
@@ -261,6 +323,7 @@ class UnitYBuilder:
             final_proj,
             t2u_model,
             self.config.mt_model_config.vocab_info,
+            prosody_encoder_model,
         )
 
     def build_speech_encoder(self) -> TransformerEncoder:
@@ -292,11 +355,10 @@ class UnitYBuilder:
             self.w2v2_encoder_builder.config.num_encoder_attn_heads
         )
 
-        # Unlike wav2vec2, we use ReLU (i.e. standard FFN activation function)
-        # instead of GELU.
         ffn = StandardFeedForwardNetwork(
             self.config.model_dim,
             self.w2v2_encoder_builder.config.ffn_inner_dim,
+            inner_activation=GELU() if self.config.use_gelu else ReLU(),
             bias=True,
             device=self.device,
             dtype=self.dtype,
@@ -365,6 +427,20 @@ class UnitYBuilder:
         )
 
 
+class NllbWithGELUBuilder(NllbBuilder):
+    @override
+    def build_ffn(self) -> FeedForwardNetwork:
+        return StandardFeedForwardNetwork(
+            self.config.model_dim,
+            self.config.ffn_inner_dim,
+            bias=True,
+            inner_activation=GELU(),
+            norm_order=TransformerNormOrder.PRE,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+
 def create_unity_model(
     config: UnitYConfig,
     device: Optional[Device] = None,
@@ -397,12 +473,28 @@ def create_unity_model(
     else:
         t2u_builder = UnitYNART2UBuilder(config.t2u_config, device=device, dtype=dtype)
 
-    mt_model_builder = NllbBuilder(config.mt_model_config, device=device, dtype=dtype)
+    if config.prosody_encoder_config is None:
+        prosody_encoder_builder = None
+    else:
+        prosody_encoder_builder = EcapaTDNNBuilder(
+            config.prosody_encoder_config, device=device, dtype=dtype
+        )
+
+    if config.use_gelu:
+        mt_model_builder: NllbBuilder = NllbWithGELUBuilder(
+            config.mt_model_config, device=device, dtype=dtype
+        )
+    else:
+        mt_model_builder = NllbBuilder(
+            config.mt_model_config, device=device, dtype=dtype
+        )
+
     unity_builder = UnitYBuilder(
         config,
         w2v2_encoder_builder,
         mt_model_builder,
         t2u_builder,
+        prosody_encoder_builder,
         device=device,
         dtype=dtype,
     )

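The new "expressivity_v2" architecture is what ties the pieces together: no text encoder, GELU feed-forward networks, the NAR T2U decoder, and an ECAPA-TDNN prosody encoder attached to the UnitY model. A hedged sketch of inspecting that configuration; the `unity_archs` registry name is assumed from the `@unity_arch` marker used above.

# Illustrative config check; `unity_archs` is assumed to be the UnitYConfig
# ArchitectureRegistry behind the @unity_arch marker in this module.
from seamless_communication.models.unity.builder import unity_archs

config = unity_archs.get_config("expressivity_v2")
print(config.use_gelu, config.use_text_encoder)   # True False
print(config.prosody_encoder_config.embed_dim)    # 512, the prosody/FiLM conditioning dim
# create_unity_model (defined above) would then attach an ECAPA_TDNN instance
# as model.prosody_encoder_model.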
+ 68 - 0
src/seamless_communication/models/unity/film.py

@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Optional
+
+import torch
+from fairseq2.nn.projection import Linear
+from fairseq2.typing import DataType, Device
+from torch import Tensor
+from torch.nn import Module, Parameter
+
+
+class FiLM(Module):
+    """
+    A Feature-wise Linear Modulation Layer from
+    'FiLM: Visual Reasoning with a General Conditioning Layer'
+    """
+
+    proj: Linear
+    s_gamma: Parameter
+    s_beta: Parameter
+
+    def __init__(
+        self,
+        cond_dim: int,
+        embed_dim: int,
+        device: Optional[Device] = None,
+        dtype: Optional[DataType] = None,
+    ):
+        super().__init__()
+
+        self.proj = Linear(
+            cond_dim, 2 * embed_dim, bias=True, device=device, dtype=dtype
+        )
+
+        self.s_gamma = Parameter(
+            torch.ones(
+                1,
+                device=device,
+                dtype=dtype,
+            ),
+            requires_grad=True,
+        )
+
+        self.s_beta = Parameter(
+            torch.ones(
+                1,
+                device=device,
+                dtype=dtype,
+            ),
+            requires_grad=True,
+        )
+
+    def forward(self, x: Tensor, cond_embs: Tensor) -> Tensor:
+        """
+        x -- [B, T, H]
+        cond_embs -- [B, 1, C]
+        """
+        # get trainable gamma, beta
+        gammas, betas = self.proj(cond_embs).chunk(2, dim=-1)  # B x 1 x H
+
+        # apply film
+        gammas = self.s_gamma * gammas.expand_as(x)
+        betas = self.s_beta * betas.expand_as(x)
+
+        return (gammas + 1.0) * x + betas  # type: ignore[no-any-return]

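FiLM is the conditioning mechanism: the prosody embedding is projected to per-channel scale and shift, which modulate the NAR T2U decoder layers and the FiLM-enabled duration predictor (see length_regulator.py below). A small shape-check sketch of the module added above; the dimensions are illustrative, with a 512-dim conditioning vector as in the expressivity setup.

# Sketch: FiLM modulates [B, T, H] features with a [B, 1, C] prosody embedding.
import torch

from seamless_communication.models.unity.film import FiLM

film = FiLM(cond_dim=512, embed_dim=256)

x = torch.randn(2, 50, 256)     # decoder features, [B, T, H]
cond = torch.randn(2, 1, 512)   # prosody encoder output after unsqueeze(1), [B, 1, C]
out = film(x, cond)
print(out.shape)                # torch.Size([2, 50, 256])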
+ 26 - 2
src/seamless_communication/models/unity/length_regulator.py

@@ -14,6 +14,8 @@ from fairseq2.typing import DataType, Device
 from torch import Tensor
 from torch.nn import Conv1d, Dropout, Module, ReLU, Sequential
 
+from seamless_communication.models.unity.film import FiLM
+
 
 class HardUpsampling(Module):
     """Upsamples sequences in a deterministic way as governed by durations."""
@@ -46,6 +48,7 @@ class VariancePredictor(Module):
     conv2: Sequential
     ln2: LayerNorm
     proj: Linear
+    film: Optional[FiLM]
 
     def __init__(
         self,
@@ -54,6 +57,8 @@ class VariancePredictor(Module):
         var_pred_kernel_size: int,
         var_pred_dropout: float,
         bias: bool = True,
+        use_film: bool = False,
+        film_cond_dim: int = 512,
         device: Optional[Device] = None,
         dtype: Optional[DataType] = None,
     ):
@@ -99,7 +104,19 @@ class VariancePredictor(Module):
             var_pred_hidden_dim, 1, bias=True, device=device, dtype=dtype
         )
 
-    def forward(self, seqs: Tensor, padding_mask: Optional[PaddingMask]) -> Tensor:
+        if use_film:
+            self.film = FiLM(
+                film_cond_dim, var_pred_hidden_dim, device=device, dtype=dtype
+            )
+        else:
+            self.register_module("film", None)
+
+    def forward(
+        self,
+        seqs: Tensor,
+        padding_mask: Optional[PaddingMask],
+        film_cond_emb: Optional[Tensor] = None,
+    ) -> Tensor:
         # Ensure that we do not leak padded positions in the convolution layer.
         seqs = apply_padding_mask(seqs, padding_mask)
 
@@ -131,6 +148,12 @@ class VariancePredictor(Module):
 
         seqs = self.dropout_module(seqs)
 
+        seqs = apply_padding_mask(seqs, padding_mask)
+
+        if self.film is not None and film_cond_emb is not None:
+            seqs = self.film(seqs, film_cond_emb)
+            seqs = apply_padding_mask(seqs, padding_mask)
+
         # (N, S, H) -> (N, S, 1) -> (N, S)
         seqs = self.proj(seqs).squeeze(dim=2)
 
@@ -174,8 +197,9 @@ class VarianceAdaptor(Module):
         padding_mask: Optional[PaddingMask],
         duration_factor: float = 1.0,
         min_duration: int = 0,
+        film_cond_emb: Optional[Tensor] = None,
     ) -> Tuple[Tensor, PaddingMask]:
-        log_durations = self.duration_predictor(seqs, padding_mask)
+        log_durations = self.duration_predictor(seqs, padding_mask, film_cond_emb)
 
         durations = torch.clamp(
             torch.round((torch.exp(log_durations) - 1) * duration_factor).long(),

+ 79 - 35
src/seamless_communication/models/unity/loader.py

@@ -47,10 +47,16 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
 
         keys_to_delete = []
 
+        # ExpressiveUnitY model (from multi_arch codebase)
+        if config.prosody_encoder_config is not None:
+            encoder_key = "s2t_model.encoder"
+            decoder_key = "s2t_model.decoder"
+            t2u_decoder_key = "t2s_model.decoder"
         # X2T/S2T + T2U model.
-        if config.t2u_config is not None:
+        elif config.t2u_config is not None:
             encoder_key = "encoder"
             decoder_key = "target_letter_decoder"
+            t2u_decoder_key = "decoder"
         # X2T model.
         elif config.use_text_encoder:
             encoder_key = "speech_encoder"
@@ -70,12 +76,18 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
         # Remnant of wav2vec2 pretraining, not needed for eval or fine-tuning.
         keys_to_delete.append(f"{encoder_key}.w2v_encoder.w2v_model.mask_emb")
 
-        keys_to_delete.append("decoder.char_upsampler.embed_positions._float_tensor")
-        keys_to_delete.append("decoder.char_upsampler.embed_tokens_char.weight")
+        keys_to_delete.append(
+            f"{t2u_decoder_key}.char_upsampler.embed_positions._float_tensor"
+        )
+        keys_to_delete.append(
+            f"{t2u_decoder_key}.char_upsampler.embed_tokens_char.weight"
+        )
 
         # Delete AlignmentEncoder keys for inference.
         alignment_encoder_keys = [
-            key for key in state_dict if key.startswith("decoder.alignment_encoder.")
+            key
+            for key in state_dict
+            if key.startswith(f"{t2u_decoder_key}.alignment_encoder.")
         ]
         keys_to_delete.extend(alignment_encoder_keys)
 
@@ -87,6 +99,17 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
             ]
         )
 
+        if config.prosody_encoder_config is not None:
+            keys_to_delete.extend(
+                [
+                    f"{t2u_decoder_key}.embed_positions._float_tensor",
+                    "t2s_model.global_proj_dec.weight",
+                    "t2s_model.global_proj_dec.bias",
+                    "t2s_model.decoder_target_letter_nllb_spm_decoder.encoder.proj.weight",
+                    "t2s_model.decoder_target_letter_nllb_spm_decoder.encoder.proj.bias",
+                ]
+            )
+
         for key in keys_to_delete:
             if key in state_dict:
                 del state_dict[key]
@@ -157,10 +180,19 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
 
     @staticmethod
     def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]:
+        # ExpressiveUnitY model (from multi_arch codebase)
+        if config.prosody_encoder_config is not None:
+            encoder_key = "s2t_model.encoder"
+            decoder_key = "s2t_model.decoder"
+            t2u_encoder_key = "t2s_model.encoder"
+            t2u_decoder_key = "t2s_model.decoder"
+            ecapa_tdnn_key = "global_prosody"
         # X2T/S2T + T2U model.
-        if config.t2u_config is not None:
+        elif config.t2u_config is not None:
             encoder_key = "encoder"
             decoder_key = "target_letter_decoder"
+            t2u_encoder_key = "synthesizer_encoder"
+            t2u_decoder_key = "decoder"
         # X2T model.
         elif config.use_text_encoder:
             encoder_key = "speech_encoder"
@@ -231,8 +263,8 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
         # fairseq was accidentally run with a pre-LN encoder, and ended up with
         # a redundant `LayerNorm` right after the Conformer blocks. We mitigate
         # that issue here by moving that `LayerNorm` to the adaptor block.
+        # fmt: off
         if config.w2v2_encoder_config.use_conformer:
-            # fmt: off
             key_map.update(
                 {
                     fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner_layer_norm."
@@ -244,7 +276,7 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
                     rf"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner.layer_norm."
                 }
             )
-            # fmt: on
+        # fmt: on
 
         if config.use_conformer_adaptor:
             key_map.update(
@@ -303,44 +335,56 @@ class UnitYLoader(ModelLoader[UnitYModel, UnitYConfig]):
                 # fmt: on
             }
         )
+        # ExpressiveUnitY model (from multi_arch codebase)
+        if config.prosody_encoder_config is not None:
+            key_map.update(
+                {
+                    # fmt: off
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.film\.":              r"t2u_model.decoder.layers.\1.film.",
+                    fr"^{ecapa_tdnn_key}\.":                                       r"prosody_encoder_model.",
+                    r"^t2s_model\.global_proj_enc\.":                             r"t2u_model.prosody_proj.",
+                    # fmt: on
+                }
+            )
+
         # X2T/S2T + T2U model.
         if config.t2u_config is not None:
             key_map.update(
                 {
                     # fmt: off
                     # T2U Encoder
-                    r"^synthesizer_encoder\.layers\.([0-9]+)\.self_attn\.out_proj\.":     r"t2u_model.encoder.layers.\1.self_attn.output_proj.",
-                    r"^synthesizer_encoder\.layers\.([0-9]+)\.self_attn\.":               r"t2u_model.encoder.layers.\1.self_attn.",
-                    r"^synthesizer_encoder\.layers\.([0-9]+)\.self_attn_layer_norm\.":    r"t2u_model.encoder.layers.\1.self_attn_layer_norm.",
-                    r"^synthesizer_encoder\.layers\.([0-9]+)\.fc1\.":                     r"t2u_model.encoder.layers.\1.ffn.inner_proj.",
-                    r"^synthesizer_encoder\.layers\.([0-9]+)\.fc2\.":                     r"t2u_model.encoder.layers.\1.ffn.output_proj.",
-                    r"^synthesizer_encoder\.layers\.([0-9]+)\.final_layer_norm\.":        r"t2u_model.encoder.layers.\1.ffn_layer_norm.",
-                    r"^synthesizer_encoder\.layer_norm\.":                                r"t2u_model.encoder.layer_norm.",
+                    fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.self_attn\.out_proj\.":     r"t2u_model.encoder.layers.\1.self_attn.output_proj.",
+                    fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.self_attn\.":               r"t2u_model.encoder.layers.\1.self_attn.",
+                    fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.self_attn_layer_norm\.":    r"t2u_model.encoder.layers.\1.self_attn_layer_norm.",
+                    fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.fc1\.":                     r"t2u_model.encoder.layers.\1.ffn.inner_proj.",
+                    fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.fc2\.":                     r"t2u_model.encoder.layers.\1.ffn.output_proj.",
+                    fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.final_layer_norm\.":        r"t2u_model.encoder.layers.\1.ffn_layer_norm.",
+                    fr"^{t2u_encoder_key}\.layer_norm\.":                                r"t2u_model.encoder.layer_norm.",
 
                     # T2U Decoder frontend
-                    r"^decoder\.embed_tokens_text\.":                           r"t2u_model.decoder_frontend.embed_char.",
-                    r"^decoder\.embed_tokens_unit\.":                           r"t2u_model.decoder_frontend.embed.",
-                    r"^decoder\.embed_tokens\.":                                r"t2u_model.decoder_frontend.embed.",
-                    r"^decoder\.var_adaptor\.duration_predictor\.":             r"t2u_model.decoder_frontend.variance_adaptor.duration_predictor.",
-                    r"^decoder\.dec_pos_emb_alpha":                             r"t2u_model.decoder_frontend.pos_emb_alpha",
-                    r"^decoder\.char_upsampler\.pos_emb_alpha":                 r"t2u_model.decoder_frontend.pos_emb_alpha_char",
+                    fr"^{t2u_decoder_key}\.embed_tokens_text\.":                           r"t2u_model.decoder_frontend.embed_char.",
+                    fr"^{t2u_decoder_key}\.embed_tokens_unit\.":                           r"t2u_model.decoder_frontend.embed.",
+                    fr"^{t2u_decoder_key}\.embed_tokens\.":                                r"t2u_model.decoder_frontend.embed.",
+                    fr"^{t2u_decoder_key}\.var_adaptor\.duration_predictor\.":             r"t2u_model.decoder_frontend.variance_adaptor.duration_predictor.",
+                    fr"^{t2u_decoder_key}\.dec_pos_emb_alpha":                             r"t2u_model.decoder_frontend.pos_emb_alpha",
+                    fr"^{t2u_decoder_key}\.char_upsampler\.pos_emb_alpha":                 r"t2u_model.decoder_frontend.pos_emb_alpha_char",
 
                     # T2U Decoder
-                    r"^decoder\.layers\.([0-9]+)\.self_attn\.out_proj\.":     r"t2u_model.decoder.layers.\1.self_attn.output_proj.",
-                    r"^decoder\.layers\.([0-9]+)\.self_attn\.":               r"t2u_model.decoder.layers.\1.self_attn.",
-                    r"^decoder\.layers\.([0-9]+)\.self_attn_layer_norm\.":    r"t2u_model.decoder.layers.\1.self_attn_layer_norm.",
-                    r"^decoder\.layers\.([0-9]+)\.layer_norm\.":              r"t2u_model.decoder.layers.\1.self_attn_layer_norm.",
-                    r"^decoder\.layers\.([0-9]+)\.encoder_attn\.out_proj\.":  r"t2u_model.decoder.layers.\1.encoder_decoder_attn.output_proj.",
-                    r"^decoder\.layers\.([0-9]+)\.encoder_attn\.":            r"t2u_model.decoder.layers.\1.encoder_decoder_attn.",
-                    r"^decoder\.layers\.([0-9]+)\.encoder_attn_layer_norm\.": r"t2u_model.decoder.layers.\1.encoder_decoder_attn_layer_norm.",
-                    r"^decoder\.layers\.([0-9]+)\.fc1\.":                     r"t2u_model.decoder.layers.\1.ffn.inner_proj.",
-                    r"^decoder\.layers\.([0-9]+)\.fc2\.":                     r"t2u_model.decoder.layers.\1.ffn.output_proj.",
-                    r"^decoder\.layers\.([0-9]+)\.final_layer_norm\.":        r"t2u_model.decoder.layers.\1.ffn_layer_norm.",
-                    r"^decoder\.layers\.([0-9]+)\.ffn\.ffn\.0\.":             r"t2u_model.decoder.layers.\1.conv1d.conv1.",
-                    r"^decoder\.layers\.([0-9]+)\.ffn\.ffn\.2\.":             r"t2u_model.decoder.layers.\1.conv1d.conv2.",
-                    r"^decoder\.layers\.([0-9]+)\.ffn\.layer_norm\.":         r"t2u_model.decoder.layers.\1.conv1d_layer_norm.",
-                    r"^decoder\.layer_norm\.":                                r"t2u_model.decoder.layer_norm.",
-                    r"^decoder\.output_projection\.":                         r"t2u_model.final_proj.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.self_attn\.out_proj\.":     r"t2u_model.decoder.layers.\1.self_attn.output_proj.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.self_attn\.":               r"t2u_model.decoder.layers.\1.self_attn.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.self_attn_layer_norm\.":    r"t2u_model.decoder.layers.\1.self_attn_layer_norm.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.layer_norm\.":              r"t2u_model.decoder.layers.\1.self_attn_layer_norm.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.encoder_attn\.out_proj\.":  r"t2u_model.decoder.layers.\1.encoder_decoder_attn.output_proj.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.encoder_attn\.":            r"t2u_model.decoder.layers.\1.encoder_decoder_attn.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.encoder_attn_layer_norm\.": r"t2u_model.decoder.layers.\1.encoder_decoder_attn_layer_norm.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.fc1\.":                     r"t2u_model.decoder.layers.\1.ffn.inner_proj.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.fc2\.":                     r"t2u_model.decoder.layers.\1.ffn.output_proj.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.final_layer_norm\.":        r"t2u_model.decoder.layers.\1.ffn_layer_norm.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.ffn\.ffn\.0\.":             r"t2u_model.decoder.layers.\1.conv1d.conv1.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.ffn\.ffn\.2\.":             r"t2u_model.decoder.layers.\1.conv1d.conv2.",
+                    fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.ffn\.layer_norm\.":         r"t2u_model.decoder.layers.\1.conv1d_layer_norm.",
+                    fr"^{t2u_decoder_key}\.layer_norm\.":                                r"t2u_model.decoder.layer_norm.",
+                    fr"^{t2u_decoder_key}\.output_projection\.":                         r"t2u_model.final_proj.",
                     # fmt: on
                 }
             )

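The key map above now builds its regex patterns from per-architecture prefixes ("t2s_model.decoder" for ExpressiveUnitY checkpoints, "decoder" otherwise). The sketch below is not the fairseq2 ModelLoader machinery, only an illustration of how such a prefix-parameterized map renames checkpoint keys.

import re
from typing import Any, Dict

def remap_keys(state_dict: Dict[str, Any], key_map: Dict[str, str]) -> Dict[str, Any]:
    """Rename each checkpoint key with the first matching pattern."""
    remapped = {}
    for old_key, value in state_dict.items():
        new_key = old_key
        for pattern, repl in key_map.items():
            new_key, count = re.subn(pattern, repl, old_key)
            if count:
                break
        remapped[new_key] = value
    return remapped

t2u_decoder_key = "t2s_model.decoder"  # ExpressiveUnitY checkpoint layout
key_map = {
    fr"^{t2u_decoder_key}\.output_projection\.": r"t2u_model.final_proj.",
}
print(remap_keys({"t2s_model.decoder.output_projection.weight": None}, key_map))
# {'t2u_model.final_proj.weight': None}
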
+ 19 - 2
src/seamless_communication/models/unity/model.py

@@ -19,6 +19,7 @@ from overrides import final as finaloverride
 from torch import Tensor
 from torch.nn import Module
 
+from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN
 from seamless_communication.models.unity.nar_decoder import NARTransformerDecoder
 from seamless_communication.models.unity.nar_decoder_frontend import NARDecoderFrontend
 
@@ -42,6 +43,7 @@ class UnitYModel(EncoderDecoderModel):
     text_decoder: TransformerDecoder
     final_proj: Projection
     t2u_model: Union["UnitYT2UModel", "UnitYNART2UModel", None]
+    prosody_encoder_model: Optional[ECAPA_TDNN]
 
     def __init__(
         self,
@@ -54,6 +56,7 @@ class UnitYModel(EncoderDecoderModel):
         final_proj: Projection,
         t2u_model: Union["UnitYT2UModel", "UnitYNART2UModel", None],
         target_vocab_info: VocabularyInfo,
+        prosody_encoder_model: Optional[ECAPA_TDNN] = None,
         input_modality: str = "speech",
     ) -> None:
         model_dim = speech_encoder.model_dim
@@ -93,6 +96,10 @@ class UnitYModel(EncoderDecoderModel):
             self.register_module("t2u_model", None)
 
         self.target_vocab_info = target_vocab_info
+        if prosody_encoder_model is not None:
+            self.prosody_encoder_model = prosody_encoder_model
+        else:
+            self.register_module("prosody_encoder_model", None)
 
     @finaloverride
     def encode(
@@ -304,6 +311,7 @@ class UnitYNART2UModel(Module):
     decoder: NARTransformerDecoder
     final_proj: Projection
     target_vocab_info: VocabularyInfo
+    prosody_proj: Optional[Projection]
 
     def __init__(
         self,
@@ -312,6 +320,7 @@ class UnitYNART2UModel(Module):
         decoder: NARTransformerDecoder,
         final_proj: Projection,
         target_vocab_info: VocabularyInfo,
+        prosody_proj: Optional[Projection] = None,
     ) -> None:
         super().__init__()
 
@@ -339,20 +348,27 @@ class UnitYNART2UModel(Module):
 
         self.target_vocab_info = target_vocab_info
 
+        self.prosody_proj = prosody_proj
+
     def forward(
         self,
         text_decoder_output: Tensor,
         text_decoder_padding_mask: Optional[PaddingMask],
         text_seqs: Optional[Tensor],
+        film_cond_emb: Optional[Tensor] = None,
     ) -> Tuple[SequenceModelOutput, Optional[PaddingMask]]:
         encoder_output, encoder_padding_mask = self.encode(
             text_decoder_output, text_decoder_padding_mask
         )
 
+        if self.prosody_proj is not None and film_cond_emb is not None:
+            encoder_output = encoder_output + self.prosody_proj(film_cond_emb)
+
         decoder_output, decoder_padding_mask = self.decode(
             encoder_output,
             encoder_padding_mask,
             text_seqs,
+            film_cond_emb,
         )
 
         return self.project(decoder_output), decoder_padding_mask
@@ -372,14 +388,15 @@ class UnitYNART2UModel(Module):
         encoder_output: Tensor,
         encoder_padding_mask: Optional[PaddingMask],
         text_seqs: Optional[Tensor],
+        film_cond_emb: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Optional[PaddingMask]]:
         # encoder_output: (N, S, M)
         # text_seqs: (N, S)
         seqs, padding_mask = self.decoder_frontend(
-            encoder_output, encoder_padding_mask, text_seqs
+            encoder_output, encoder_padding_mask, text_seqs, film_cond_emb
         )
 
-        return self.decoder(seqs, padding_mask)  # type: ignore[no-any-return]
+        return self.decoder(seqs, padding_mask, film_cond_emb=film_cond_emb)  # type: ignore[no-any-return]
 
     def project(self, decoder_output: Tensor) -> SequenceModelOutput:
         logits = self.final_proj(decoder_output)

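A runnable shape sketch of the conditioning path added to UnitYNART2UModel.forward above, using the dimensions that appear elsewhere in this commit (512-dim prosody embedding, 1024-dim T2U model). The (N, 1, 512) shape for film_cond_emb is an assumption chosen so the broadcast add works; the tensors are random stand-ins, not real encoder or ECAPA-TDNN outputs.

import torch
from torch.nn import Linear

N, S = 2, 37                          # batch size, text-decoder sequence length
prosody_dim, model_dim = 512, 1024

film_cond_emb = torch.randn(N, 1, prosody_dim)   # utterance-level prosody embedding
encoder_output = torch.randn(N, S, model_dim)    # T2U encoder states, (N, S, M)

prosody_proj = Linear(prosody_dim, model_dim, bias=True)

# The projected prosody embedding is broadcast-added to every encoder
# position; the same film_cond_emb is then handed down to the NAR decoder,
# where the FiLM layers consume it (see nar_decoder_layer.py below).
conditioned = encoder_output + prosody_proj(film_cond_emb)
print(conditioned.shape)  # torch.Size([2, 37, 1024])
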
+ 2 - 1
src/seamless_communication/models/unity/nar_decoder.py

@@ -66,9 +66,10 @@ class NARTransformerDecoder(Module):
         self,
         seqs: Tensor,
         padding_mask: Optional[PaddingMask],
+        film_cond_emb: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Optional[PaddingMask]]:
         for layer in self.layers.drop_iter():
-            seqs, padding_mask = layer(seqs, padding_mask)
+            seqs, padding_mask = layer(seqs, padding_mask, film_cond_emb=film_cond_emb)
 
         if self.layer_norm is not None:
             seqs = self.layer_norm(seqs)

+ 2 - 0
src/seamless_communication/models/unity/nar_decoder_frontend.py

@@ -302,6 +302,7 @@ class NARDecoderFrontend(Module):
         encoder_output: Tensor,
         encoder_padding_mask: Optional[PaddingMask],
         text_seqs: Optional[Tensor],
+        film_cond_emb: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Optional[PaddingMask]]:
         assert text_seqs is not None
 
@@ -323,6 +324,7 @@ class NARDecoderFrontend(Module):
             seqs,
             encoder_padding_mask,
             min_duration=1,
+            film_cond_emb=film_cond_emb,
         )
 
         seqs = self.forward_unit_pos_embedding(seqs, padding_mask)

+ 19 - 0
src/seamless_communication/models/unity/nar_decoder_layer.py

@@ -13,6 +13,8 @@ from fairseq2.typing import DataType, Device, finaloverride
 from torch import Tensor
 from torch.nn import Conv1d, Dropout, Module, ReLU
 
+from seamless_communication.models.unity.film import FiLM
+
 
 @final
 class Conv1dBlock(Module):
@@ -111,6 +113,7 @@ class NARTransformerDecoderLayer(Module):
     conv1d: Conv1dBlock
     conv1d_dropout: Optional[Dropout]
     conv1d_layer_norm: LayerNorm
+    film: Optional[FiLM]
 
     def __init__(
         self,
@@ -118,6 +121,8 @@ class NARTransformerDecoderLayer(Module):
         conv1d: Conv1dBlock,
         dropout_p: float = 0.1,
         conv1d_dropout_p: float = 0.1,
+        use_film: bool = False,
+        film_cond_dim: int = 512,
         device: Optional[Device] = None,
         dtype: Optional[DataType] = None,
     ) -> None:
@@ -130,6 +135,10 @@ class NARTransformerDecoderLayer(Module):
             The dropout probability on the outputs of the self attention layer.
         :param conv1d_dropout_p:
             The dropout probability on the outputs of the conv1d block.
+        :param use_film:
+            Whether to condition on a fixed-size vector through FiLM.
+        :param film_cond_dim:
+            The dimensionality of the fixed-size conditioning vector used during the forward pass.
         """
         super().__init__()
 
@@ -159,16 +168,26 @@ class NARTransformerDecoderLayer(Module):
             self.model_dim, device=device, dtype=dtype
         )
 
+        if use_film:
+            self.film = FiLM(film_cond_dim, self.model_dim, device=device, dtype=dtype)
+        else:
+            self.register_module("film", None)
+
     @finaloverride
     def forward(
         self,
         seqs: Tensor,
         padding_mask: Optional[PaddingMask],
+        film_cond_emb: Optional[Tensor] = None,
     ) -> Tuple[Tensor, Optional[PaddingMask]]:
         seqs = self._forward_self_attn(seqs, padding_mask)
 
         seqs = self._forward_conv1d(seqs, padding_mask)
 
+        if self.film is not None and film_cond_emb is not None:
+            seqs = self.film(seqs, film_cond_emb)
+            seqs = apply_padding_mask(seqs, padding_mask)
+
         return seqs, padding_mask
 
     def _forward_self_attn(

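The FiLM module imported above lives in seamless_communication.models.unity.film and is not part of this diff. For orientation only, here is a generic feature-wise linear modulation sketch (per-feature scale and shift predicted from the conditioning vector); the real module's internals may differ.

import torch
from torch import Tensor, nn

class TinyFiLM(nn.Module):
    """Generic FiLM block: scale and shift features using a conditioning vector."""

    def __init__(self, cond_dim: int, model_dim: int) -> None:
        super().__init__()
        self.gamma = nn.Linear(cond_dim, model_dim)  # predicts per-feature scales
        self.beta = nn.Linear(cond_dim, model_dim)   # predicts per-feature shifts

    def forward(self, seqs: Tensor, cond: Tensor) -> Tensor:
        # seqs: (N, S, model_dim), cond: (N, 1, cond_dim)
        return self.gamma(cond) * seqs + self.beta(cond)

film = TinyFiLM(cond_dim=512, model_dim=1024)
out = film(torch.randn(2, 37, 1024), torch.randn(2, 1, 512))
print(out.shape)  # torch.Size([2, 37, 1024])
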
+ 106 - 7
src/seamless_communication/models/unity/t2u_builder.py

@@ -17,7 +17,7 @@ from fairseq2.models.transformer import (
 from fairseq2.models.utils.arch_registry import ArchitectureRegistry
 from fairseq2.nn.embedding import Embedding, StandardEmbedding, init_scaled_embedding
 from fairseq2.nn.position_encoder import SinusoidalPositionEncoder
-from fairseq2.nn.projection import TiedProjection
+from fairseq2.nn.projection import Linear, Projection, TiedProjection
 from fairseq2.nn.transformer import (
     FeedForwardNetwork,
     MultiheadAttention,
@@ -35,6 +35,7 @@ from fairseq2.nn.transformer import (
     create_default_sdpa,
 )
 from fairseq2.typing import DataType, Device
+from torch.nn import GELU, ReLU
 
 from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer
 from seamless_communication.models.unity.length_regulator import (
@@ -55,6 +56,8 @@ class VariancePredictorConfig:
     var_pred_hidden_dim: int
     var_pred_kernel_size: int
     var_pred_dropout: float
+    use_film: bool
+    film_cond_dim: int
 
 
 @dataclass
@@ -73,6 +76,8 @@ class NARDecoderConfig:
     conv1d_kernel_size: int
     conv1d_inner_dim: int
     conv1d_dropout_p: float
+    use_film: bool
+    film_cond_dim: int
 
 
 @dataclass
@@ -113,9 +118,17 @@ class UnitYT2UConfig:
     dropout_p: float
     """The dropout probability in Transformer layers."""
 
-    def update_unit_vocabulary(self, info: VocabularyInfo) -> None:
-        """Update unit vocabulary configuration from ``info``."""
-        self.unit_vocabulary_size, self.unit_pad_idx = info.size, info.pad_idx
+    use_gelu: bool
+    """If ``True``, uses the GELU activation function in feed-forward networks."""
+
+    char_pad_idx: int
+    """The index of the pad symbol in the char vocabulary."""
+
+    use_prosody_proj: bool
+    """If ``True``, uses a prosody projection layer."""
+
+    prosody_encoder_dim: int
+    """The dimensionality of the prosody encoder (e.g. ECAPA_TDNN) output."""
 
 
 unity_t2u_archs = ArchitectureRegistry[UnitYT2UConfig]("unity_t2u")
@@ -140,6 +153,10 @@ def _base_t2u() -> UnitYT2UConfig:
         num_decoder_attn_heads=16,
         ffn_inner_dim=1024 * 8,
         dropout_p=0.1,
+        use_gelu=False,
+        char_pad_idx=0,
+        use_prosody_proj=False,
+        prosody_encoder_dim=0,
     )
 
 
@@ -159,6 +176,10 @@ def _medium_t2u() -> UnitYT2UConfig:
         num_decoder_attn_heads=16,
         ffn_inner_dim=1024 * 8,
         dropout_p=0.1,
+        use_gelu=False,
+        char_pad_idx=0,
+        use_prosody_proj=False,
+        prosody_encoder_dim=0,
     )
 
 
@@ -168,6 +189,8 @@ def _base_nar() -> UnitYT2UConfig:
         var_pred_hidden_dim=256,
         var_pred_kernel_size=3,
         var_pred_dropout=0.5,
+        use_film=False,
+        film_cond_dim=0,
     )
 
     nar_decoder_frontend_config = NARDecoderFrontendConfig(
@@ -184,6 +207,8 @@ def _base_nar() -> UnitYT2UConfig:
         conv1d_kernel_size=7,
         conv1d_inner_dim=1024,
         conv1d_dropout_p=0.1,
+        use_film=False,
+        film_cond_dim=0,
     )
 
     return UnitYT2UConfig(
@@ -200,6 +225,59 @@ def _base_nar() -> UnitYT2UConfig:
         num_decoder_attn_heads=16,
         ffn_inner_dim=1024 * 8,
         dropout_p=0.0,
+        use_gelu=False,
+        char_pad_idx=0,
+        use_prosody_proj=False,
+        prosody_encoder_dim=0,
+    )
+
+
+@unity_t2u_arch("expressivity_nar")
+def _expressivity_nar() -> UnitYT2UConfig:
+    duration_predictor_config = VariancePredictorConfig(
+        var_pred_hidden_dim=256,
+        var_pred_kernel_size=3,
+        var_pred_dropout=0.5,
+        use_film=True,
+        film_cond_dim=512,
+    )
+
+    nar_decoder_frontend_config = NARDecoderFrontendConfig(
+        subword_to_unit_upsampling_type="hard",
+        duration_predictor_config=duration_predictor_config,
+        pitch_predictor_config=None,
+        energy_predictor_config=None,
+    )
+
+    nar_decoder_config = NARDecoderConfig(
+        model_name_or_card="seamless_expressivity",
+        char_vocabulary_size=10904,
+        char_max_seq_len=4000,
+        conv1d_kernel_size=7,
+        conv1d_inner_dim=1024,
+        conv1d_dropout_p=0.1,
+        use_film=True,
+        film_cond_dim=512,
+    )
+
+    return UnitYT2UConfig(
+        model_dim=1024,
+        unit_max_seq_len=4000,
+        target_vocab_info=VocabularyInfo(
+            size=10005, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1
+        ),
+        num_encoder_layers=4,
+        num_decoder_layers=4,
+        nar_decoder_frontend_config=nar_decoder_frontend_config,
+        nar_decoder_config=nar_decoder_config,
+        num_encoder_attn_heads=16,
+        num_decoder_attn_heads=16,
+        ffn_inner_dim=1024 * 8,
+        dropout_p=0.0,
+        use_gelu=True,
+        char_pad_idx=1,
+        use_prosody_proj=True,
+        prosody_encoder_dim=512,
     )
 
 
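The "expressivity_nar" architecture above turns FiLM on in both the duration predictor and the decoder layers and sets every conditioning width to 512, matching the ECAPA-TDNN output consumed by the prosody projection. A hypothetical sanity check (not part of the library) that makes the implied invariant explicit, assuming the dataclasses defined in this file:

def check_film_dims(config: UnitYT2UConfig) -> None:
    """Hypothetical helper: FiLM widths must match the prosody encoder output."""
    dp = config.nar_decoder_frontend_config.duration_predictor_config
    nar = config.nar_decoder_config
    if config.use_prosody_proj:
        assert dp.use_film and nar.use_film
        assert dp.film_cond_dim == config.prosody_encoder_dim == nar.film_cond_dim

# e.g. check_film_dims(_expressivity_nar()) holds for the config defined above.
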
@@ -417,12 +495,15 @@ class UnitYNART2UBuilder:
 
         decoder_frontend = self.build_decoder_frontend(embed_unit)
 
+        prosody_proj = self.build_prosody_proj()
+
         return UnitYNART2UModel(
             encoder,
             decoder_frontend,
             decoder,
             final_proj,
             self.config.target_vocab_info,
+            prosody_proj=prosody_proj,
         )
 
     def build_unit_embedding(self) -> StandardEmbedding:
@@ -482,6 +563,8 @@ class UnitYNART2UBuilder:
             duration_predictor_config.var_pred_hidden_dim,
             duration_predictor_config.var_pred_kernel_size,
             duration_predictor_config.var_pred_dropout,
+            use_film=duration_predictor_config.use_film,
+            film_cond_dim=duration_predictor_config.film_cond_dim,
             device=self.device,
             dtype=self.dtype,
         )
@@ -518,19 +601,18 @@ class UnitYNART2UBuilder:
         nllb_tokenizer = NllbTokenizerLoader(asset_store, download_manager)(
             self.config.nar_decoder_config.model_name_or_card
         )
-        text_pad_idx = nllb_tokenizer.vocab_info.pad_idx
 
         char_pos_encoder = SinusoidalPositionEncoder(
             self.config.model_dim,
             self.config.nar_decoder_config.char_max_seq_len,
-            _legacy_pad_idx=text_pad_idx,
+            _legacy_pad_idx=self.config.char_pad_idx,
             device=self.device,
         )
 
         embed_char = StandardEmbedding(
             num_embeddings=self.config.nar_decoder_config.char_vocabulary_size,
             embedding_dim=self.config.model_dim,
-            pad_idx=text_pad_idx,
+            pad_idx=self.config.char_pad_idx,
             init_fn=init_scaled_embedding,
             device=self.device,
             dtype=self.dtype,
@@ -584,6 +666,8 @@ class UnitYNART2UBuilder:
             conv1d,
             dropout_p=self.config.dropout_p,
             conv1d_dropout_p=self.config.nar_decoder_config.conv1d_dropout_p,
+            use_film=self.config.nar_decoder_config.use_film,
+            film_cond_dim=self.config.nar_decoder_config.film_cond_dim,
             device=self.device,
             dtype=self.dtype,
         )
@@ -608,11 +692,26 @@ class UnitYNART2UBuilder:
             self.config.model_dim,
             self.config.ffn_inner_dim,
             bias=True,
+            inner_activation=GELU() if self.config.use_gelu else ReLU(),
             norm_order=TransformerNormOrder.PRE,
             device=self.device,
             dtype=self.dtype,
         )
 
+    def build_prosody_proj(self) -> Optional[Projection]:
+        """Build a prosody projection layer if the config enables it."""
+
+        if self.config.use_prosody_proj:
+            return Linear(
+                self.config.prosody_encoder_dim,
+                self.config.model_dim,
+                bias=True,
+                dtype=self.dtype,
+                device=self.device,
+            )
+        else:
+            return None
+
 
 def create_unity_t2u_model(
     config: UnitYT2UConfig,