Can Balioglu 1 year ago
parent commit 283f74250f
51 changed files with 389 additions and 374 deletions
  1. setup.py (+1, -1)
  2. src/seamless_communication/cards/conformer_shaw.yaml (+1, -1)
  3. src/seamless_communication/cards/nar_t2u_aligner.yaml (+1, -1)
  4. src/seamless_communication/cards/seamless_streaming_monotonic_decoder.yaml (+1, -1)
  5. src/seamless_communication/cards/unity_nllb-100.yaml (+1, -1)
  6. src/seamless_communication/cards/unity_nllb-200.yaml (+1, -1)
  7. src/seamless_communication/cards/vocoder_36langs.yaml (+1, -1)
  8. src/seamless_communication/cards/vocoder_pretssel.yaml (+1, -1)
  9. src/seamless_communication/cards/vocoder_pretssel_16khz.yaml (+1, -1)
  10. src/seamless_communication/cards/vocoder_v2.yaml (+1, -1)
  11. src/seamless_communication/cards/xlsr2_1b_v2.yaml (+1, -1)
  12. src/seamless_communication/cli/m4t/audio_to_units/audio_to_units.py (+1, -1)
  13. src/seamless_communication/cli/m4t/evaluate/evaluate.py (+3, -4)
  14. src/seamless_communication/cli/m4t/finetune/dataloader.py (+1, -1)
  15. src/seamless_communication/cli/m4t/finetune/trainer.py (+8, -8)
  16. src/seamless_communication/cli/toxicity/asr_etox.py (+1, -1)
  17. src/seamless_communication/inference/generator.py (+5, -3)
  18. src/seamless_communication/inference/translator.py (+4, -4)
  19. src/seamless_communication/models/aligner/alignment_extractor.py (+4, -5)
  20. src/seamless_communication/models/aligner/builder.py (+8, -7)
  21. src/seamless_communication/models/aligner/loader.py (+9, -14)
  22. src/seamless_communication/models/aligner/model.py (+29, -10)
  23. src/seamless_communication/models/conformer_shaw/builder.py (+34, -19)
  24. src/seamless_communication/models/conformer_shaw/loader.py (+12, -12)
  25. src/seamless_communication/models/generator/builder.py (+2, -3)
  26. src/seamless_communication/models/generator/ecapa_tdnn_builder.py (+2, -2)
  27. src/seamless_communication/models/generator/loader.py (+6, -12)
  28. src/seamless_communication/models/generator/streamable.py (+1, -1)
  29. src/seamless_communication/models/generator/vocoder.py (+7, -3)
  30. src/seamless_communication/models/monotonic_decoder/builder.py (+2, -4)
  31. src/seamless_communication/models/monotonic_decoder/loader.py (+9, -17)
  32. src/seamless_communication/models/monotonic_decoder/model.py (+10, -7)
  33. src/seamless_communication/models/monotonic_decoder/monotonic_decoder.py (+2, -2)
  34. src/seamless_communication/models/monotonic_decoder/monotonic_decoder_layer.py (+2, -2)
  35. src/seamless_communication/models/monotonic_decoder/p_choose.py (+2, -2)
  36. src/seamless_communication/models/pretssel/ecapa_tdnn_builder.py (+2, -2)
  37. src/seamless_communication/models/tokenizer.py (+12, -29)
  38. src/seamless_communication/models/unit_extractor/wav2vec2_layer_output.py (+2, -0)
  39. src/seamless_communication/models/unity/builder.py (+62, -28)
  40. src/seamless_communication/models/unity/char_tokenizer.py (+8, -33)
  41. src/seamless_communication/models/unity/fft_decoder.py (+2, -2)
  42. src/seamless_communication/models/unity/fft_decoder_layer.py (+3, -3)
  43. src/seamless_communication/models/unity/loader.py (+78, -76)
  44. src/seamless_communication/models/unity/model.py (+13, -8)
  45. src/seamless_communication/models/unity/nar_decoder_frontend.py (+2, -2)
  46. src/seamless_communication/models/unity/t2u_builder.py (+5, -6)
  47. src/seamless_communication/models/vocoder/builder.py (+2, -2)
  48. src/seamless_communication/models/vocoder/loader.py (+9, -13)
  49. src/seamless_communication/models/vocoder/vocoder.py (+6, -5)
  50. src/seamless_communication/toxicity/etox_bad_word_checker.py (+1, -2)
  51. src/seamless_communication/toxicity/mintox.py (+7, -8)

+ 1 - 1
setup.py

@@ -22,7 +22,7 @@ setup(
     license="Creative Commons",
     install_requires=[
         "datasets",
-        "fairseq2==0.2.*",
+#        "fairseq2==0.2.*",
         "fire",
         "librosa",
         "openai-whisper",

+ 1 - 1
src/seamless_communication/cards/conformer_shaw.yaml

@@ -5,6 +5,6 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: conformer_shaw
-model_type: wav2vec2
+model_family: conformer_shaw
 model_arch: conformer_shaw_600m
 checkpoint: "https://huggingface.co/facebook/conformer-shaw/resolve/main/conformer_shaw.pt"

+ 1 - 1
src/seamless_communication/cards/nar_t2u_aligner.yaml

@@ -6,7 +6,7 @@
 
 name: nar_t2u_aligner
 char_tokenizer: "https://huggingface.co/facebook/seamless-streaming/resolve/main/spm_char_lang38_tc.model"
-model_type: unity2_aligner
+model_family: unity2_aligner
 model_arch: nar_t2u_aligner
 checkpoint: "https://dl.fbaipublicfiles.com/seamless/models/unity2_aligner.pt"
 num_units: 10000

+ 1 - 1
src/seamless_communication/cards/seamless_streaming_monotonic_decoder.yaml

@@ -5,6 +5,6 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: seamless_streaming_monotonic_decoder
-model_type: monotonic_decoder
+model_family: monotonic_decoder
 model_arch: dense_1b
 checkpoint: "https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_monotonic_decoder.pt"

+ 1 - 1
src/seamless_communication/cards/unity_nllb-100.yaml

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: unity_nllb-100
-model_type: unity
+model_family: unity
 tokenizer: "https://huggingface.co/facebook/seamless-m4t-large/resolve/main/tokenizer.model"
 default_lang: eng
 langs:

+ 1 - 1
src/seamless_communication/cards/unity_nllb-200.yaml

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: unity_nllb-200
-model_type: unity
+model_family: unity
 tokenizer: "https://huggingface.co/facebook/seamless-m4t-medium/resolve/main/tokenizer.model"
 default_lang: eng
 langs:

+ 1 - 1
src/seamless_communication/cards/vocoder_36langs.yaml

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: vocoder_36langs
-model_type: vocoder_code_hifigan
+model_family: vocoder_code_hifigan
 model_arch: base
 checkpoint: "https://huggingface.co/facebook/seamless-m4t-vocoder/resolve/main/vocoder_36langs.pt"
 model_config: {

+ 1 - 1
src/seamless_communication/cards/vocoder_pretssel.yaml

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: vocoder_pretssel
-model_type: vocoder_pretssel
+model_family: vocoder_pretssel
 model_arch: 24khz
 checkpoint: "https://github.com/facebookresearch/seamless_communication;gated=true"
 sample_rate: 24000

+ 1 - 1
src/seamless_communication/cards/vocoder_pretssel_16khz.yaml

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: vocoder_pretssel_16khz
-model_type: vocoder_pretssel
+model_family: vocoder_pretssel
 model_arch: 16khz
 checkpoint: "https://github.com/facebookresearch/seamless_communication;gated=true"
 sample_rate: 16000

+ 1 - 1
src/seamless_communication/cards/vocoder_v2.yaml

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: vocoder_v2
-model_type: vocoder_code_hifigan
+model_family: vocoder_code_hifigan
 model_arch: base
 checkpoint: "https://dl.fbaipublicfiles.com/seamless/models/vocoder_v2.pt"
 model_config: {

+ 1 - 1
src/seamless_communication/cards/xlsr2_1b_v2.yaml

@@ -5,6 +5,6 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 name: xlsr2_1b_v2
-model_type: wav2vec2
+model_family: wav2vec2
 model_arch: xlsr2_1b_v2
 checkpoint: "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/xlsr2_1b_v2.pt"

+ 1 - 1
src/seamless_communication/cli/m4t/audio_to_units/audio_to_units.py

@@ -14,7 +14,7 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-def main():
+def main() -> None:
     parser = argparse.ArgumentParser(
         description="Convert raw audio to units (and optionally audio) using UnitExtractor."
     )

+ 3 - 4
src/seamless_communication/cli/m4t/evaluate/evaluate.py

@@ -19,7 +19,6 @@ import torchaudio
 from fairseq2.data import Collater, DataPipeline, FileMapper
 from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
 from fairseq2.data.text import StrSplitter, TextTokenizer, read_text
-from fairseq2.data.typing import StringLike
 from fairseq2.typing import DataType, Device
 from torch import Tensor
 from tqdm import tqdm
@@ -181,10 +180,10 @@ def build_data_pipeline(
 
 def adjust_output_for_corrupted_inputs(
     valid_sequences: Tensor,
-    text_output: List[StringLike],
+    text_output: List[str],
     speech_output: Optional[BatchedSpeechOutput],
-) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]:
-    adjusted_text_output: List[StringLike] = []
+) -> Tuple[List[str], Optional[BatchedSpeechOutput]]:
+    adjusted_text_output: List[str] = []
     adjusted_speech_output: Optional[BatchedSpeechOutput] = None
 
     if speech_output is not None:

+ 1 - 1
src/seamless_communication/cli/m4t/finetune/dataloader.py

@@ -83,7 +83,7 @@ class BatchingConfig:
     """Select between fp16/fp32 for float tensors """
 
 
-def worker_init_fn(worker_id):
+def worker_init_fn(worker_id) -> None:
     np.random.seed(np.random.get_state()[1][0] + worker_id)
 
 

+ 8 - 8
src/seamless_communication/cli/m4t/finetune/trainer.py

@@ -116,12 +116,12 @@ class UnitYFinetuneWrapper(nn.Module):
                 unit_encoder_out,
                 unit_encoder_padding_mask,
             ) = self.model.t2u_model.encode(
-                text_decoder_output=text_decoder_out,
-                text_decoder_padding_mask=text_decoder_padding_mask,
+                text_decoder_out,
+                text_decoder_padding_mask,
             )
             seqs = batch.text_to_units.prev_output_tokens.to(self.device)
             seq_lens = batch.text_to_units.target_lengths.to(self.device)
-            unit_decoder_out, _ = self.model.t2u_model.decode(
+            unit_decoder_out = self.model.t2u_model.decode(
                 seqs=seqs,
                 padding_mask=PaddingMask(seq_lens, seqs.size(1)),
                 encoder_output=unit_encoder_out,
@@ -156,7 +156,7 @@ class CalcLoss:
             text_logits.device
         )
         s2t_loss = SequenceModelOutput(
-            logits=text_logits, vocab_info=self.s2t_vocab_info
+            text_logits, self.s2t_vocab_info.pad_idx
         ).compute_loss(
             targets=batch.speech_to_text.target_tokens.to(text_logits.device),
             ignore_prefix_size=1,
@@ -167,7 +167,7 @@ class CalcLoss:
         assert batch.text_to_units.target_lengths is not None
         s2u_numel = torch.sum(batch.text_to_units.target_lengths).to(unit_logits.device)
         s2u_loss = SequenceModelOutput(
-            logits=unit_logits, vocab_info=self.t2u_vocab_info
+            unit_logits, self.t2u_vocab_info.pad_idx
         ).compute_loss(
             targets=batch.text_to_units.target_tokens.to(unit_logits.device),
             ignore_prefix_size=1,
@@ -314,7 +314,7 @@ class UnitYFinetune:
         eval_loss = loss_hist.reduce()
         self._update_eval_stats(eval_loss)
 
-    def _train_step_log(self):
+    def _train_step_log(self) -> None:
         """Log train stats"""
         if (self.update_idx + 1) % self.params.log_steps == 0:
             avg_loss = self.train_loss_hist.reduce()
@@ -340,7 +340,7 @@ class UnitYFinetune:
         self.train_loss_hist.update(1, loss.item())
         self._train_step_log()
 
-    def _save_model(self):
+    def _save_model(self) -> None:
         logger.info("Saving model")
         if dist_utils.is_main_process():
             state_dict = {
@@ -351,7 +351,7 @@ class UnitYFinetune:
         if dist_utils.is_dist_initialized():
             dist.barrier()
 
-    def run(self):
+    def run(self) -> None:
         logger.info("Start finetuning")
         self._reset_stats()
         self._eval_model()

+ 1 - 1
src/seamless_communication/cli/toxicity/asr_etox.py

@@ -207,7 +207,7 @@ def build_data_pipeline(
 
     pipeline_builder = read_text(data_file, rtrim=True).skip(1).map(split_tsv)
 
-    map_file = FileMapper(root_dir=audio_root_dir, cached_fd_count=10)
+    map_file = FileMapper(root_dir=Path(audio_root_dir), cached_fd_count=10)
 
     pipeline_builder.map(
         map_file,

+ 5 - 3
src/seamless_communication/inference/generator.py

@@ -8,7 +8,7 @@ from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
 import torch
-from fairseq2.data import SequenceData, StringLike
+from fairseq2.data import SequenceData
 from fairseq2.data.text import TextTokenizer
 from fairseq2.generation import (
     BeamSearchSeq2SeqGenerator,
@@ -137,6 +137,7 @@ class UnitYGenerator:
             decoder_frontend=model.text_decoder_frontend,
             decoder=model.text_decoder,
             final_proj=model.final_proj,
+            max_target_seq_len=model.max_target_seq_len,
             target_vocab_info=model.target_vocab_info,
         )
 
@@ -169,6 +170,7 @@ class UnitYGenerator:
                 decoder_frontend=model.text_decoder_frontend,
                 decoder=model.text_decoder,
                 final_proj=model.final_proj,
+                max_target_seq_len=model.max_target_seq_len,
                 target_vocab_info=model.target_vocab_info,
             )
             generator = BeamSearchSeq2SeqGenerator(
@@ -234,7 +236,7 @@ class UnitYGenerator:
         ngram_filtering: bool = False,
         duration_factor: float = 1.0,
         prosody_encoder_input: Optional[SequenceData] = None,
-    ) -> Tuple[List[StringLike], Optional[Tensor]]:
+    ) -> Tuple[List[str], Optional[Tensor]]:
         """
         :param source_seqs:
             The source sequences to use for generation. *Shape:* :math:`(N,S,*)`,
@@ -346,7 +348,7 @@ class UnitYGenerator:
             unit_seqs = t2u_model_output.logits.argmax(dim=2)
             # Apply the padding mask to the generated units.
             unit_seqs = apply_padding_mask(
-                unit_seqs, decoder_padding_mask, t2u_model_output.vocab_info.pad_idx
+                unit_seqs, decoder_padding_mask, t2u_model_output.pad_idx
             )
 
         # Convert to speech units.

+ 4 - 4
src/seamless_communication/inference/translator.py

@@ -13,7 +13,7 @@ import torch
 import torch.nn as nn
 from fairseq2.assets import asset_store
 from fairseq2.assets.card import AssetCard
-from fairseq2.data import Collater, SequenceData, StringLike
+from fairseq2.data import Collater, SequenceData
 from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
 from fairseq2.data.text import TextTokenizer
 from fairseq2.memory import MemoryBlock
@@ -169,7 +169,7 @@ class Translator(nn.Module):
         unit_generation_ngram_filtering: bool = False,
         duration_factor: float = 1.0,
         prosody_encoder_input: Optional[SequenceData] = None,
-    ) -> Tuple[List[StringLike], Optional[Tensor]]:
+    ) -> Tuple[List[str], Optional[Tensor]]:
         # We disregard unit generations opts for the NAR T2U decoder.
         if output_modality != Modality.SPEECH or isinstance(
             model.t2u_model, UnitYNART2UModel
@@ -226,8 +226,8 @@ class Translator(nn.Module):
         unit_generation_ngram_filtering: bool = False,
         duration_factor: float = 1.0,
         prosody_encoder_input: Optional[SequenceData] = None,
-        src_text: Optional[StringLike] = None,
-    ) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]:
+        src_text: Optional[str] = None,
+    ) -> Tuple[List[str], Optional[BatchedSpeechOutput]]:
         """
         The main method used to perform inference on all tasks.
 

+ 4 - 5
src/seamless_communication/models/aligner/alignment_extractor.py

@@ -12,7 +12,6 @@ import torch
 import torch.nn as nn
 import torchaudio
 from fairseq2.typing import DataType, Device
-from fairseq2.data.typing import StringLike
 from torch import Tensor
 
 from seamless_communication.models.aligner.loader import load_unity2_alignment_model
@@ -82,7 +81,7 @@ class AlignmentExtractor(nn.Module):
             audio = audio.mean(0)
         assert (
             audio.ndim == 1
-        ), f"After channel averaging audio shape expected to be [Time] i.e. mono audio"
+        ), "After channel averaging audio shape expected to be [Time] i.e. mono audio"
         audio = audio.to(self.device, self.dtype)
 
         return audio
@@ -101,7 +100,7 @@ class AlignmentExtractor(nn.Module):
         text: str,
         plot: bool = False,
         add_trailing_silence: bool = False,
-    ) -> Tuple[Tensor, Tensor, List[StringLike]]:
+    ) -> Tuple[Tensor, Tensor, List[str]]:
         if isinstance(audio, Tensor) and not torch.is_floating_point(audio):
             # we got units as audio arg
             units = audio
@@ -137,11 +136,11 @@ class AlignmentExtractor(nn.Module):
 
         return alignment_durations, tokenized_text_ids, tokenized_text_tokens
 
-    def detokenize_text(self, tokenized_text_ids: Tensor) -> StringLike:
+    def detokenize_text(self, tokenized_text_ids: Tensor) -> str:
         return self.alignment_model.alignment_frontend.decode_text(tokenized_text_ids)
 
     def plot_alignment(
-        self, audio: Tensor, text_tokens: List[StringLike], durations: Tensor
+        self, audio: Tensor, text_tokens: List[str], durations: Tensor
     ) -> None:
         if not matplotlib_available:
             raise RuntimeError(

+ 8 - 7
src/seamless_communication/models/aligner/builder.py

@@ -10,9 +10,9 @@ from typing import Optional, Union
 import torch
 from fairseq2.assets.card import AssetCard
 from fairseq2.data.vocabulary_info import VocabularyInfo
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.nn.embedding import StandardEmbedding, init_scaled_embedding
-from fairseq2.typing import DataType, Device
+from fairseq2.typing import CPU, DataType, Device
 
 from seamless_communication.models.aligner.model import (
     UnitY2AlignmentEncoder,
@@ -56,7 +56,7 @@ class UnitY2AlignmentConfig:
     alignment_frontend_config: UnitY2AlignmentFrontendConfig
 
 
-aligner_archs = ArchitectureRegistry[UnitY2AlignmentConfig]("unity2_aligner")
+aligner_archs = ModelArchitectureRegistry[UnitY2AlignmentConfig]()
 
 aligner_arch = aligner_archs.decorator
 
@@ -90,14 +90,14 @@ def _aligner_nar_t2u() -> UnitY2AlignmentConfig:
 class UnitY2AlignmentBuilder:
     config: UnitY2AlignmentConfig
     device: Optional[Device]
-    dtype: DataType
+    dtype: Optional[DataType]
 
     def __init__(
         self,
         config: UnitY2AlignmentConfig,
         *,
         device: Optional[Device] = None,
-        dtype: DataType = torch.float32,
+        dtype: Optional[DataType] = torch.float32,
     ) -> None:
         """
         :param config:
@@ -155,7 +155,8 @@ class UnitY2AlignmentBuilder:
             dropout=cfg.dropout,
             temperature=cfg.temperature,
             reduction_factor=cfg.reduction_factor,
-            dtype=self.dtype,
+            device=self.device or CPU,
+            dtype=self.dtype or torch.float32,
         )
         alignment_encoder.training = training
 
@@ -165,7 +166,7 @@ class UnitY2AlignmentBuilder:
 def create_unity2_alignment_model(
     config: UnitY2AlignmentConfig,
     device: Optional[Device] = None,
-    dtype: DataType = torch.float32,
+    dtype: Optional[DataType] = torch.float32,
 ) -> UnitY2AlignmentModel:
     """Create a UnitY model.
 

+ 9 - 14
src/seamless_communication/models/aligner/loader.py

@@ -4,24 +4,23 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, List, Mapping
+from typing import Any, List, Dict
 
 import torch
-from fairseq2.assets import asset_store, download_manager
-from fairseq2.models.utils import ConfigLoader, ModelLoader
+from fairseq2.models import setup_model_family
 
 from seamless_communication.models.aligner.builder import (
     UnitY2AlignmentConfig,
     aligner_archs,
     create_unity2_alignment_model,
 )
-from seamless_communication.models.aligner.model import UnitY2AlignmentModel
+from seamless_communication.models.aligner.model import UNITY2_ALIGNER_FAMILY
 from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer
 
 
 def convert_unity2_aligner_checkpoint(
-    checkpoint: Mapping[str, Any], config: UnitY2AlignmentConfig
-) -> Mapping[str, Any]:
+    checkpoint: Dict[str, Any], config: UnitY2AlignmentConfig
+) -> Dict[str, Any]:
     if (
         "model" in checkpoint
         and "alignment_encoder.t_conv.1.weight" in checkpoint["model"]
@@ -74,15 +73,11 @@ def _get_char_index_mapping(config: UnitY2AlignmentConfig) -> List[int]:
     return model_to_dict_mapping
 
 
-load_unity2_alignment_config = ConfigLoader[UnitY2AlignmentConfig](
-    asset_store, aligner_archs
-)
-
-load_unity2_alignment_model = ModelLoader[UnitY2AlignmentModel, UnitY2AlignmentConfig](
-    asset_store,
-    download_manager,
-    load_unity2_alignment_config,
+load_unity2_alignment_model, load_unity2_alignment_config = setup_model_family(
+    UNITY2_ALIGNER_FAMILY,
+    UnitY2AlignmentConfig,
     create_unity2_alignment_model,
+    aligner_archs,
     convert_unity2_aligner_checkpoint,
     restrict_checkpoints=False,
 )

+ 29 - 10
src/seamless_communication/models/aligner/model.py

@@ -4,25 +4,27 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, List, Tuple, Union
+from typing import Any, Final, List, Tuple, Union
 
 import numpy as np
 import numpy.typing as npt
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from fairseq2.data import CString
+from fairseq2.models import Model
 from fairseq2.nn.embedding import StandardEmbedding
 from fairseq2.nn.padding import to_padding_mask
-from fairseq2.typing import DataType
+from fairseq2.typing import DataType, Device
 from torch import Tensor
 from torch.nn import Module
 
 from seamless_communication.models.unity.char_tokenizer import CharTokenizer
 from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer
 
+UNITY2_ALIGNER_FAMILY: Final = "unity2_aligner"
 
-class UnitY2AlignmentFrontend(Module):
+
+class UnitY2AlignmentFrontend(nn.Module):
     def __init__(
         self,
         embed_text: StandardEmbedding,
@@ -53,7 +55,7 @@ class UnitY2AlignmentFrontend(Module):
 
     def tokenize_text_to_tokens(
         self, text: str, add_trailing_silence: bool = False
-    ) -> List[Union[CString, str]]:
+    ) -> List[str]:
         tokenized = self.encode_text.encode_as_tokens(text)
         if add_trailing_silence:
             tokenized = tokenized + [tokenized[0]]
@@ -90,6 +92,7 @@ class UnitY2AlignmentEncoder(Module):
         dropout: float,
         temperature: float,
         reduction_factor: int,
+        device: Device,
         dtype: DataType,
     ):
         super().__init__()
@@ -101,7 +104,12 @@ class UnitY2AlignmentEncoder(Module):
             if i < text_layers - 1:
                 layers.append(
                     nn.Conv1d(
-                        embed_dim, embed_dim, kernel_size=3, padding=1, dtype=dtype
+                        embed_dim,
+                        embed_dim,
+                        kernel_size=3,
+                        padding=1,
+                        device=device,
+                        dtype=dtype,
                     )
                 )
                 layers.append(nn.ReLU())
@@ -109,7 +117,12 @@ class UnitY2AlignmentEncoder(Module):
             else:
                 layers.append(
                     nn.Conv1d(
-                        embed_dim, embed_dim, kernel_size=1, padding=0, dtype=dtype
+                        embed_dim,
+                        embed_dim,
+                        kernel_size=1,
+                        padding=0,
+                        device=device,
+                        dtype=dtype,
                     )
                 )
                 layers.append(nn.Dropout(p=dropout))
@@ -122,7 +135,12 @@ class UnitY2AlignmentEncoder(Module):
             if i < feat_layers - 1:
                 layers.append(
                     nn.Conv1d(
-                        input_dim, embed_dim, kernel_size=3, padding=1, dtype=dtype
+                        input_dim,
+                        embed_dim,
+                        kernel_size=3,
+                        padding=1,
+                        device=device,
+                        dtype=dtype,
                     )
                 )
                 layers.append(nn.ReLU())
@@ -135,6 +153,7 @@ class UnitY2AlignmentEncoder(Module):
                         kernel_size=1,
                         padding=0,
                         stride=reduction_factor,
+                        device=device,
                         dtype=dtype,
                     )
                 )
@@ -277,7 +296,7 @@ def viterbi_decode(
     return durations
 
 
-class UnitY2AlignmentModel(Module):
+class UnitY2AlignmentModel(Model):
     alignment_encoder: UnitY2AlignmentEncoder
     alignment_frontend: UnitY2AlignmentFrontend
 
@@ -286,7 +305,7 @@ class UnitY2AlignmentModel(Module):
         alignment_frontend: UnitY2AlignmentFrontend,
         alignment_encoder: UnitY2AlignmentEncoder,
     ):
-        super().__init__()
+        super().__init__(UNITY2_ALIGNER_FAMILY)
         self.alignment_frontend = alignment_frontend
         self.alignment_encoder = alignment_encoder
 

+ 34 - 19
src/seamless_communication/models/conformer_shaw/builder.py

@@ -4,13 +4,13 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from dataclasses import asdict, dataclass
-from typing import Optional
+from dataclasses import asdict, dataclass, field
+from typing import Final, Optional
 
 from fairseq2.models.conformer import ConformerConvolution
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.models.w2vbert import w2vbert_archs
-from fairseq2.models.wav2vec2.builder import (
+from fairseq2.models.wav2vec2 import (
     Wav2Vec2Builder,
     Wav2Vec2Config,
     Wav2Vec2EncoderBuilder,
@@ -21,15 +21,17 @@ from fairseq2.models.wav2vec2.model import Wav2Vec2Model
 from fairseq2.nn.transformer import SDPA, ShawRelativePositionSDPA, create_default_sdpa
 from fairseq2.typing import DataType, Device
 
+CONFORMER_SHAW_FAMILY: Final = "conformer_shaw"
+
 
 @dataclass
 class ShawRelativePositionSDPAConfig:
     """Holds the configuration of the :class:ShawRelativePositionSDPA module."""
 
-    max_left_rel_pos: int
+    max_left_rel_pos: int = 64
     """The left clipping value for relative positions."""
 
-    max_right_rel_pos: Optional[int]
+    max_right_rel_pos: Optional[int] = 8
     """The right clipping value for relative positions."""
 
     use_rel_pos_values: bool = False
@@ -40,18 +42,23 @@ class ShawRelativePositionSDPAConfig:
 class ConformerShawEncoderConfig(Wav2Vec2EncoderConfig):
     """Holds the configuration of a conformer shaw encoder."""
 
-    shaw_rel_pos_sdpa_config: Optional[ShawRelativePositionSDPAConfig]
+    shaw_rel_pos_sdpa_config: Optional[ShawRelativePositionSDPAConfig] = None
     """The parameters for ShawRelativePositionSDPA."""
 
 
-conformer_shaw_archs = ArchitectureRegistry[ConformerShawEncoderConfig](
-    "conformer_shaw"
-)
+@dataclass
+class ConformerShawConfig(Wav2Vec2Config):
+    """Holds the configuration of a conformer shaw model."""
+
+    encoder_config: ConformerShawEncoderConfig = field(
+        default_factory=ConformerShawEncoderConfig
+    )
 
-conformer_shaw_arch = conformer_shaw_archs.decorator
 
+conformer_shaw_archs = ModelArchitectureRegistry[ConformerShawConfig]()
+
+conformer_shaw_arch = conformer_shaw_archs.decorator
 
-@conformer_shaw_arch("600m")
 def _conformer_shaw_600m_encoder() -> ConformerShawEncoderConfig:
     w2vbert_config = w2vbert_archs.get_config("600m")
     w2v2_encoder_config = w2vbert_config.w2v2_config.encoder_config
@@ -68,18 +75,20 @@ def _conformer_shaw_600m_encoder() -> ConformerShawEncoderConfig:
     return conformer_shaw_encoder_config
 
 
-@wav2vec2_arch("conformer_shaw_600m")
-def _conformer_shaw_600m() -> Wav2Vec2Config:
+@conformer_shaw_arch("conformer_shaw_600m")
+def _conformer_shaw_600m() -> ConformerShawConfig:
     encoder_config = _conformer_shaw_600m_encoder()
 
-    return Wav2Vec2Config(
+    return ConformerShawConfig(
         encoder_config,
         final_dim=768,
         final_proj_bias=True,
         temporal_mask_span_len=10,
         max_temporal_mask_prob=0.65,
+        min_num_temporal_mask_spans=2,
         spatial_mask_span_len=10,
         max_spatial_mask_prob=0.0,
+        min_num_spatial_mask_spans=2,
         quantized_dim=768,
         num_codebooks=2,
         num_codebook_entries=320,
@@ -101,6 +110,8 @@ class ConformerShawEncoderBuilder(Wav2Vec2EncoderBuilder):
     """
 
     config: ConformerShawEncoderConfig
+    device: Optional[Device]
+    dtype: Optional[DataType]
 
     def __init__(
         self,
@@ -119,11 +130,15 @@ class ConformerShawEncoderBuilder(Wav2Vec2EncoderBuilder):
         """
         super().__init__(config, device=device, dtype=dtype)
 
+        self.config = config
+
         assert self.config.use_conformer, "This architecture only supports a Conformer."
         assert (
             self.config.pos_encoder_type == "shaw_relative"
         ), "This architecture only supports ShawRelativePositionSDPA."
 
+        self.device, self.dtype = device, dtype
+
     def build_sdpa(self) -> SDPA:
         if self.config.shaw_rel_pos_sdpa_config is None:
             raise ValueError(
@@ -157,7 +172,7 @@ class ConformerShawEncoderBuilder(Wav2Vec2EncoderBuilder):
 
 
 def create_conformer_shaw_model(
-    config: Wav2Vec2Config,
+    config: ConformerShawConfig,
     *,
     device: Optional[Device] = None,
     dtype: Optional[DataType] = None,
@@ -171,12 +186,12 @@ def create_conformer_shaw_model(
     :param dtype:
         The data type of module parameters and buffers.
     """
-    assert isinstance(config.encoder_config, ConformerShawEncoderConfig)
-
     encoder_builder = ConformerShawEncoderBuilder(
         config.encoder_config, device=device, dtype=dtype
     )
 
-    builder = Wav2Vec2Builder(config, encoder_builder, device=device, dtype=dtype)
+    builder = Wav2Vec2Builder(
+        CONFORMER_SHAW_FAMILY, config, encoder_builder, device=device, dtype=dtype
+    )
 
     return builder.build_model()

+ 12 - 12
src/seamless_communication/models/conformer_shaw/loader.py

@@ -4,25 +4,25 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, Mapping
+from typing import Any, Dict
 
 import torch
 
-from fairseq2.assets import asset_store, download_manager
-from fairseq2.models.utils import ModelLoader
+from fairseq2.models import setup_model_family
 from fairseq2.models.utils.checkpoint import convert_fairseq_checkpoint
-from fairseq2.models.wav2vec2.builder import Wav2Vec2Config
-from fairseq2.models.wav2vec2.loader import load_wav2vec2_config
-from fairseq2.models.wav2vec2.model import Wav2Vec2Model
+from fairseq2.models.wav2vec2 import Wav2Vec2Model
 
 from seamless_communication.models.conformer_shaw.builder import (
+    CONFORMER_SHAW_FAMILY,
+    ConformerShawConfig,
+    conformer_shaw_archs,
     create_conformer_shaw_model,
 )
 
 
 def convert_conformer_shaw_checkpoint(
-    checkpoint: Mapping[str, Any], config: Wav2Vec2Config
-) -> Mapping[str, Any]:
+    checkpoint: Dict[str, Any], config: ConformerShawConfig
+) -> Dict[str, Any]:
     """Convert a fairseq conformer shaw checkpoint to fairseq2."""
     state_dict = checkpoint["model"]
 
@@ -73,10 +73,10 @@ def convert_conformer_shaw_checkpoint(
     return convert_fairseq_checkpoint(checkpoint, key_map)
 
 
-load_conformer_shaw_model = ModelLoader[Wav2Vec2Model, Wav2Vec2Config](
-    asset_store,
-    download_manager,
-    load_wav2vec2_config,
+load_conformer_shaw_model, load_conformer_shaw_config = setup_model_family(
+    CONFORMER_SHAW_FAMILY,
+    ConformerShawConfig,
     create_conformer_shaw_model,
+    conformer_shaw_archs,
     convert_conformer_shaw_checkpoint,
 )

+ 2 - 3
src/seamless_communication/models/generator/builder.py

@@ -8,7 +8,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Literal, Optional, Tuple
 
 from fairseq2.data import VocabularyInfo
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.nn.embedding import StandardEmbedding, init_scaled_embedding
 from fairseq2.nn.position_encoder import SinusoidalPositionEncoder
 from fairseq2.nn.projection import Linear
@@ -110,8 +110,7 @@ class VocoderConfig:
     gcmvn_stats: Dict[str, List]  # type: ignore[type-arg]
 
 
-vocoder_archs = ArchitectureRegistry[VocoderConfig]("vocoder_pretssel")
-
+vocoder_archs = ModelArchitectureRegistry[VocoderConfig]()
 
 vocoder_arch = vocoder_archs.decorator
 

+ 2 - 2
src/seamless_communication/models/generator/ecapa_tdnn_builder.py

@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from typing import List, Optional
 
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.typing import DataType, Device
 
 from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN
@@ -27,7 +27,7 @@ class EcapaTDNNConfig:
     input_dim: int
 
 
-ecapa_tdnn_archs = ArchitectureRegistry[EcapaTDNNConfig]("ecapa_tdnn")
+ecapa_tdnn_archs = ModelArchitectureRegistry[EcapaTDNNConfig]()
 
 ecapa_tdnn_arch = ecapa_tdnn_archs.decorator
 

+ 6 - 12
src/seamless_communication/models/generator/loader.py

@@ -5,25 +5,19 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 
-from typing import Any, Mapping
-
-from fairseq2.assets import asset_store, download_manager
-from fairseq2.models.utils import ConfigLoader, ModelLoader
+from fairseq2.models import setup_model_family
 
+from seamless_communication.models.generator.vocoder import PRETSSEL_VOCODER_FAMILY
 from seamless_communication.models.generator.builder import (
     VocoderConfig,
     create_vocoder_model,
     vocoder_archs,
 )
-from seamless_communication.models.generator.vocoder import PretsselVocoder
-
-load_pretssel_vocoder_config = ConfigLoader[VocoderConfig](asset_store, vocoder_archs)
-
 
-load_pretssel_vocoder_model = ModelLoader[PretsselVocoder, VocoderConfig](
-    asset_store,
-    download_manager,
-    load_pretssel_vocoder_config,
+load_pretssel_vocoder_model, load_pretssel_vocoder_config = setup_model_family(
+    PRETSSEL_VOCODER_FAMILY,
+    VocoderConfig,
     create_vocoder_model,
+    vocoder_archs,
     restrict_checkpoints=False,
 )

+ 1 - 1
src/seamless_communication/models/generator/streamable.py

@@ -6,7 +6,7 @@
 
 import math
 import warnings
-from typing import Any, Dict, List, Literal, Optional, Tuple, TypeVar
+from typing import Any, Dict, List, Literal, Optional, Tuple
 
 import torch
 from fairseq2.typing import DataType, Device

+ 7 - 3
src/seamless_communication/models/generator/vocoder.py

@@ -4,10 +4,11 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, Dict, List, Literal, Optional, Tuple
+from typing import Any, Dict, Final, List, Literal, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
+from fairseq2.models import Model
 from fairseq2.nn.embedding import Embedding, StandardEmbedding
 from fairseq2.nn.padding import PaddingMask
 from fairseq2.nn.position_encoder import PositionEncoder
@@ -44,6 +45,9 @@ from .streamable import (
     StreamableResnetBlock,
 )
 
+
+PRETSSEL_VOCODER_FAMILY: Final = "vocoder_pretssel"
+
 ELU_PARAMS: Dict[str, Any] = {"alpha": 1.0}
 
 
@@ -162,7 +166,7 @@ class PretsselDecoderFrontend(Module):
         return seqs, padding_mask
 
 
-class PretsselVocoder(Module):
+class PretsselVocoder(Model):
     """The expressivity-preserving vocoder"""
 
     encoder_frontend: PretsselEncoderFrontend
@@ -212,7 +216,7 @@ class PretsselVocoder(Module):
         device: Optional[Device] = None,
         dtype: Optional[DataType] = None,
     ):
-        super().__init__()
+        super().__init__(PRETSSEL_VOCODER_FAMILY)
         self.encoder_frontend = encoder_frontend
         self.encoder = encoder
         self.decoder_frontend = decoder_frontend

+ 2 - 4
src/seamless_communication/models/monotonic_decoder/builder.py

@@ -12,7 +12,7 @@ from fairseq2.models.transformer import (
     TransformerEmbeddingFrontend,
     TransformerFrontend,
 )
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.nn.embedding import Embedding, StandardEmbedding, init_scaled_embedding
 from fairseq2.nn.position_encoder import SinusoidalPositionEncoder
 from fairseq2.nn.projection import TiedProjection
@@ -77,9 +77,7 @@ class MonotonicDecoderConfig:
     in the PChooseLayer."""
 
 
-monotonic_decoder_archs = ArchitectureRegistry[MonotonicDecoderConfig](
-    "monotonic_decoder"
-)
+monotonic_decoder_archs = ModelArchitectureRegistry[MonotonicDecoderConfig]()
 
 monotonic_decoder_arch = monotonic_decoder_archs.decorator
 

+ 9 - 17
src/seamless_communication/models/monotonic_decoder/loader.py

@@ -4,11 +4,10 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, Mapping
+from typing import Any, Dict
 
 import torch
-from fairseq2.assets import asset_store, download_manager
-from fairseq2.models.utils import ConfigLoader, ModelLoader
+from fairseq2.models import setup_model_family
 from fairseq2.models.utils.checkpoint import convert_fairseq_checkpoint
 
 from seamless_communication.models.monotonic_decoder.builder import (
@@ -16,12 +15,12 @@ from seamless_communication.models.monotonic_decoder.builder import (
     create_monotonic_decoder_model,
     monotonic_decoder_archs,
 )
-from seamless_communication.models.monotonic_decoder.model import MonotonicDecoderModel
+from seamless_communication.models.monotonic_decoder.model import MONOTONIC_DECODER_FAMILY
 
 
 def convert_monotonic_checkpoint(
-    checkpoint: Mapping[str, Any], config: MonotonicDecoderConfig
-) -> Mapping[str, Any]:
+    checkpoint: Dict[str, Any], config: MonotonicDecoderConfig
+) -> Dict[str, Any]:
     state_dict = checkpoint["model"]
 
     # Check if we have a fairseq2 checkpoint.
@@ -75,18 +74,11 @@ def convert_monotonic_checkpoint(
     return checkpoint
 
 
-load_monotonic_decoder_config = ConfigLoader[MonotonicDecoderConfig](
-    asset_store, monotonic_decoder_archs
-)
-
-
-load_monotonic_decoder_model = ModelLoader[
-    MonotonicDecoderModel, MonotonicDecoderConfig
-](
-    asset_store,
-    download_manager,
-    load_monotonic_decoder_config,
+load_monotonic_decoder_model, load_monotonic_decoder_config = setup_model_family(
+    MONOTONIC_DECODER_FAMILY,
+    MonotonicDecoderConfig,
     create_monotonic_decoder_model,
+    monotonic_decoder_archs,
     convert_monotonic_checkpoint,
     restrict_checkpoints=False,
 )

+ 10 - 7
src/seamless_communication/models/monotonic_decoder/model.py

@@ -4,23 +4,26 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Optional, Tuple, final
+from typing import Final, Optional, Tuple, final
 
+from fairseq2.models import Model
 from fairseq2.models.transformer.frontend import TransformerFrontend
 from fairseq2.nn.incremental_state import IncrementalStateBag
 from fairseq2.nn.padding import PaddingMask
 from fairseq2.nn.projection import Projection
-from overrides import final as finaloverride
+from overrides import final as override
 from torch import Tensor
-from torch.nn import Module
 
 from seamless_communication.models.monotonic_decoder.monotonic_decoder import (
     MonotonicTransformerDecoder,
 )
 
 
+MONOTONIC_DECODER_FAMILY: Final = "monotonic_decoder"
+
+
 @final
-class MonotonicDecoderModel(Module):
+class MonotonicDecoderModel(Model):
     text_decoder_frontend: TransformerFrontend
     text_decoder: MonotonicTransformerDecoder
     final_proj: Projection
@@ -31,13 +34,13 @@ class MonotonicDecoderModel(Module):
         text_decoder: MonotonicTransformerDecoder,
         final_proj: Projection,
     ) -> None:
-        super().__init__()
+        super().__init__(MONOTONIC_DECODER_FAMILY)
 
         self.text_decoder_frontend = text_decoder_frontend
         self.text_decoder = text_decoder
         self.final_proj = final_proj
 
-    @finaloverride
+    @override
     def decode(
         self,
         seqs: Tensor,
@@ -59,7 +62,7 @@ class MonotonicDecoderModel(Module):
             state_bag=state_bag,
         )
 
-    @finaloverride
+    @override
     def project(self, decoder_output: Tensor) -> Tensor:
         logits = self.final_proj(decoder_output)
 

+ 2 - 2
src/seamless_communication/models/monotonic_decoder/monotonic_decoder.py

@@ -16,7 +16,7 @@ from fairseq2.nn.transformer import (
     CausalAttentionMaskFactory,
     create_standard_layer_norm,
 )
-from fairseq2.typing import DataType, Device, finaloverride
+from fairseq2.typing import DataType, Device, override
 from torch import Tensor
 from torch.nn import Module
 
@@ -62,7 +62,7 @@ class MonotonicTransformerDecoder(Module):
             self.model_dim, device=device, dtype=dtype
         )
 
-    @finaloverride
+    @override
     def forward(
         self,
         seqs: Tensor,

+ 2 - 2
src/seamless_communication/models/monotonic_decoder/monotonic_decoder_layer.py

@@ -15,7 +15,7 @@ from fairseq2.nn.transformer import (
     MultiheadAttention,
     create_standard_layer_norm,
 )
-from fairseq2.typing import DataType, Device, finaloverride
+from fairseq2.typing import DataType, Device, override
 from torch import Tensor
 from torch.nn import Dropout, Module
 
@@ -104,7 +104,7 @@ class MonotonicTransformerDecoderLayer(Module):
         else:
             self.register_module("ffn_dropout", None)
 
-    @finaloverride
+    @override
     def forward(
         self,
         seqs: Tensor,

+ 2 - 2
src/seamless_communication/models/monotonic_decoder/p_choose.py

@@ -8,7 +8,7 @@ from typing import Optional, final
 
 import torch
 from fairseq2.nn.projection import Linear
-from fairseq2.typing import DataType, Device, finaloverride
+from fairseq2.typing import DataType, Device, override
 from torch import Tensor
 from torch.nn import AvgPool1d, Module, ModuleList, ReLU
 from torch.nn.parameter import Parameter
@@ -116,7 +116,7 @@ class PChooseLayer(Module):
             ceil_mode=True,
         )
 
-    @finaloverride
+    @override
     def forward(self, seqs: Tensor, keys: Tensor) -> Tensor:
         q = self.q_energy_proj(seqs)
 

+ 2 - 2
src/seamless_communication/models/pretssel/ecapa_tdnn_builder.py

@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from typing import List, Optional
 
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.typing import DataType, Device
 
 from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN
@@ -27,7 +27,7 @@ class EcapaTDNNConfig:
     input_dim: int
 
 
-ecapa_tdnn_archs = ArchitectureRegistry[EcapaTDNNConfig]("ecapa_tdnn")
+ecapa_tdnn_archs = ModelArchitectureRegistry[EcapaTDNNConfig]()
 
 ecapa_tdnn_arch = ecapa_tdnn_archs.decorator
 

+ 12 - 29
src/seamless_communication/models/tokenizer.py

@@ -4,32 +4,26 @@
 # This source code is licensed under the BSD-style license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
+from pathlib import Path
 from typing import Optional, Sequence, Set, final
 
 from fairseq2.data.text import (
-    SentencePieceDecoder,
+    SentencePieceTokenizer,
     SentencePieceEncoder,
-    SentencePieceModel,
-    TextTokenDecoder,
-    TextTokenEncoder,
-    TextTokenizer,
-    vocab_info_from_sentencepiece,
 )
-from fairseq2.data.typing import PathLike
-from fairseq2.typing import Device, finaloverride
+from fairseq2.typing import Device, override
 
 
 @final
-class SPMTokenizer(TextTokenizer):
+class SPMTokenizer(SentencePieceTokenizer):
     """Represents standard SPM-based tokenizer used in MT tasks"""
 
-    model: SentencePieceModel
     langs: Set[str]
     prepend_target_langtok_to_target: bool
 
     def __init__(
         self,
-        pathname: PathLike,
+        path: Path,
         langs: Sequence[str],
         prepend_target_langtok_to_target: bool = True,
     ) -> None:
@@ -41,20 +35,19 @@ class SPMTokenizer(TextTokenizer):
         :param default_lang:
             The fall-back language if no language is specified.
         """
-        self.langs = set(langs)
-        self.prepend_target_langtok_to_target = prepend_target_langtok_to_target
-
         # Each language is represented by a `__lang__` control symbol.
         control_symbols = [self._lang_tok_to_internal(lang) for lang in sorted(langs)]
-        self.model = SentencePieceModel(pathname, control_symbols)
-        vocab_info = vocab_info_from_sentencepiece(self.model)
-        super().__init__(vocab_info)
+
+        super().__init__(path, control_symbols)
+
+        self.langs = set(langs)
+        self.prepend_target_langtok_to_target = prepend_target_langtok_to_target
 
     @classmethod
     def _lang_tok_to_internal(cls, lang: str) -> str:
         return f"__{lang}__"
 
-    @finaloverride
+    @override
     def create_encoder(
         self,
         *,
@@ -63,7 +56,7 @@ class SPMTokenizer(TextTokenizer):
         mode: Optional[str] = None,
         device: Optional[Device] = None,
         pin_memory: bool = False,
-    ) -> TextTokenEncoder:
+    ) -> SentencePieceEncoder:
         """Create a token encoder.
 
         :param task:
@@ -110,13 +103,3 @@ class SPMTokenizer(TextTokenizer):
             device=device,
             pin_memory=pin_memory,
         )
-
-    @finaloverride
-    def create_raw_encoder(
-        self, *, device: Optional[Device] = None, pin_memory: bool = False
-    ) -> TextTokenEncoder:
-        return SentencePieceEncoder(self.model, device=device, pin_memory=pin_memory)
-
-    @finaloverride
-    def create_decoder(self) -> TextTokenDecoder:
-        return SentencePieceDecoder(self.model)

+ 2 - 0
src/seamless_communication/models/unit_extractor/wav2vec2_layer_output.py

@@ -63,8 +63,10 @@ def _xlsr2_1b_v2() -> Wav2Vec2Config:
         final_proj_bias=True,
         temporal_mask_span_len=10,
         max_temporal_mask_prob=0.65,
+        min_num_temporal_mask_spans=2,
         spatial_mask_span_len=10,
         max_spatial_mask_prob=0.0,
+        min_num_spatial_mask_spans=2,
         quantized_dim=1024,
         num_codebooks=2,
         num_codebook_entries=320,

+ 62 - 28
src/seamless_communication/models/unity/builder.py

@@ -7,9 +7,10 @@
 from dataclasses import dataclass
 from typing import Optional, Union
 
+from fairseq2.data import VocabularyInfo
 from fairseq2.models.conformer import ConformerBlock, ConformerConvolution
 from fairseq2.models.nllb import NllbBuilder, NllbConfig, nllb_archs
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.models.w2vbert import w2vbert_archs
 from fairseq2.models.wav2vec2 import Wav2Vec2EncoderBuilder, Wav2Vec2EncoderConfig
 from fairseq2.nn.projection import TiedProjection
@@ -36,7 +37,7 @@ from seamless_communication.models.unity.adaptor_block import (
     UnitYEncoderAdaptor,
     UnitYTransformerAdaptorLayer,
 )
-from seamless_communication.models.unity.model import UnitYModel
+from seamless_communication.models.unity.model import UNITY_FAMILY, UnitYModel
 from seamless_communication.models.unity.t2u_builder import (
     UnitYNART2UBuilder,
     UnitYT2UBuilder,
@@ -100,7 +101,7 @@ class UnitYConfig:
     """The dropout probability in Transformer layers of the adaptor block."""
 
 
-unity_archs = ArchitectureRegistry[UnitYConfig]("unity")
+unity_archs = ModelArchitectureRegistry[UnitYConfig]()
 
 unity_arch = unity_archs.decorator
 
@@ -111,7 +112,15 @@ def _base() -> UnitYConfig:
 
     mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b")
 
-    mt_model_config.vocab_info.size = 256102  # NLLB-100
+    vocab_info = mt_model_config.vocab_info
+
+    mt_model_config.vocab_info = VocabularyInfo(
+        size=256102,  # NLLB-100
+        unk_idx=vocab_info.unk_idx,
+        bos_idx=vocab_info.bos_idx,
+        eos_idx=vocab_info.eos_idx,
+        pad_idx=vocab_info.pad_idx,
+    )
 
     t2u_config = unity_t2u_archs.get_config("base")
 
@@ -139,7 +148,15 @@ def _medium() -> UnitYConfig:
 
     mt_model_config: NllbConfig = nllb_archs.get_config("dense_600m")
 
-    mt_model_config.vocab_info.size = 256206  # NLLB-200
+    vocab_info = mt_model_config.vocab_info
+
+    mt_model_config.vocab_info = VocabularyInfo(
+        size=256206,  # NLLB-200
+        unk_idx=vocab_info.unk_idx,
+        bos_idx=vocab_info.bos_idx,
+        eos_idx=vocab_info.eos_idx,
+        pad_idx=vocab_info.pad_idx,
+    )
 
     t2u_config = unity_t2u_archs.get_config("medium")
 
@@ -163,11 +180,19 @@ def _medium() -> UnitYConfig:
 
 @unity_arch("base_v2")
 def _base_v2() -> UnitYConfig:
-    conformer_shaw_encoder_config = conformer_shaw_archs.get_config("600m")
+    conformer_shaw_config = conformer_shaw_archs.get_config("conformer_shaw_600m")
 
     mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b")
 
-    mt_model_config.vocab_info.size = 256102  # NLLB-100
+    vocab_info = mt_model_config.vocab_info
+
+    mt_model_config.vocab_info = VocabularyInfo(
+        size=256102,  # NLLB-100
+        unk_idx=vocab_info.unk_idx,
+        bos_idx=vocab_info.bos_idx,
+        eos_idx=vocab_info.eos_idx,
+        pad_idx=vocab_info.pad_idx,
+    )
 
     mt_model_config.max_seq_len = 4096
 
@@ -175,7 +200,7 @@ def _base_v2() -> UnitYConfig:
 
     return UnitYConfig(
         model_dim=1024,
-        w2v2_encoder_config=conformer_shaw_encoder_config,
+        w2v2_encoder_config=conformer_shaw_config.encoder_config,
         mt_model_config=mt_model_config,
         t2u_config=t2u_config,
         prosody_encoder_config=None,
@@ -193,11 +218,19 @@ def _base_v2() -> UnitYConfig:
 
 @unity_arch("expressivity_v2")
 def _expressivity_v2() -> UnitYConfig:
-    conformer_shaw_encoder_config = conformer_shaw_archs.get_config("600m")
+    conformer_shaw_config = conformer_shaw_archs.get_config("conformer_shaw_600m")
 
     mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b")
 
-    mt_model_config.vocab_info.size = 256102  # NLLB-100
+    vocab_info = mt_model_config.vocab_info
+
+    mt_model_config.vocab_info = VocabularyInfo(
+        size=256102,  # NLLB-100
+        unk_idx=vocab_info.unk_idx,
+        bos_idx=vocab_info.bos_idx,
+        eos_idx=vocab_info.eos_idx,
+        pad_idx=vocab_info.pad_idx,
+    )
 
     mt_model_config.max_seq_len = 10000
 
@@ -207,7 +240,7 @@ def _expressivity_v2() -> UnitYConfig:
 
     return UnitYConfig(
         model_dim=1024,
-        w2v2_encoder_config=conformer_shaw_encoder_config,
+        w2v2_encoder_config=conformer_shaw_config.encoder_config,
         mt_model_config=mt_model_config,
         t2u_config=t2u_config,
         prosody_encoder_config=prosody_encoder_config,
@@ -263,19 +296,19 @@ class UnitYBuilder:
         :param dtype:
             The data type of module parameters and buffers.
         """
-        if w2v2_encoder_builder.config.model_dim != config.model_dim:
+        if config.w2v2_encoder_config.model_dim != config.model_dim:
             raise ValueError(
-                f"`model_dim` and `model_dim` of `w2v2_encoder_builder.config` must be equal, but are {config.model_dim} and {w2v2_encoder_builder.config.model_dim} instead."
+                f"`config.model_dim` and `config.w2v2_encoder_config.model_dim` must be equal, but are {config.model_dim} and {config.w2v2_encoder_config.model_dim} instead."
             )
 
-        if mt_model_builder.config.model_dim != config.model_dim:
+        if config.mt_model_config.model_dim != config.model_dim:
             raise ValueError(
-                f"`model_dim` and `model_dim` of `mt_model_builder.config` must be equal, but are {config.model_dim} and {mt_model_builder.config.model_dim} instead."
+                f"`config.model_dim` and `config.mt_model_config.model_dim` must be equal, but are {config.model_dim} and {config.mt_model_config.model_dim} instead."
             )
 
-        if t2u_builder is not None and t2u_builder.config.model_dim != config.model_dim:
+        if config.t2u_config is not None and config.t2u_config.model_dim != config.model_dim:
             raise ValueError(
-                f"`model_dim` and `model_dim` of `t2u_builder.config` must be equal, but are {config.model_dim} and {t2u_builder.config.model_dim} instead."
+                f"`config.model_dim` and `config.t2u_config.model_dim` must be equal, but are {config.model_dim} and {config.t2u_config.model_dim} instead."
             )
 
         self.config = config
@@ -337,6 +370,7 @@ class UnitYBuilder:
             text_decoder,
             final_proj,
             t2u_model,
+            self.config.mt_model_config.max_seq_len or 0,
             self.config.mt_model_config.vocab_info,
             prosody_encoder_model,
         )
@@ -367,12 +401,12 @@ class UnitYBuilder:
     def build_adaptor_layer(self, idx: int) -> TransformerEncoderLayer:
         """Build a Transformer-based encoder adaptor layer."""
         self_attn = self.build_adaptor_attention(
-            self.w2v2_encoder_builder.config.num_encoder_attn_heads
+            self.config.w2v2_encoder_config.num_encoder_attn_heads
         )
 
         ffn = StandardFeedForwardNetwork(
             self.config.model_dim,
-            self.w2v2_encoder_builder.config.ffn_inner_dim,
+            self.config.w2v2_encoder_config.ffn_inner_dim,
             inner_activation=GELU() if self.config.use_gelu else ReLU(),
             bias=True,
             device=self.device,
@@ -396,12 +430,12 @@ class UnitYBuilder:
         # Empirically shown that, in adaptor layers, vanilla MHA performs better
         # than MHA with relative positional encoding.
         self_attn = self.build_adaptor_attention(
-            self.w2v2_encoder_builder.config.num_encoder_attn_heads
+            self.config.w2v2_encoder_config.num_encoder_attn_heads
         )
 
         conv = ConformerConvolution(
-            self.w2v2_encoder_builder.config.model_dim,
-            self.w2v2_encoder_builder.config.depthwise_conv_kernel_size,
+            self.config.w2v2_encoder_config.model_dim,
+            self.config.w2v2_encoder_config.depthwise_conv_kernel_size,
             device=self.device,
             dtype=self.dtype,
         )
@@ -446,13 +480,13 @@ class NllbWithGELUBuilder(NllbBuilder):
     @override
     def build_ffn(self) -> FeedForwardNetwork:
         return StandardFeedForwardNetwork(
-            self.config.model_dim,
-            self.config.ffn_inner_dim,
+            self._config.model_dim,
+            self._config.ffn_inner_dim,
             bias=True,
             inner_activation=GELU(),
             norm_order=TransformerNormOrder.PRE,
-            device=self.device,
-            dtype=self.dtype,
+            device=self._device,
+            dtype=self._dtype,
         )
 
 
@@ -497,11 +531,11 @@ def create_unity_model(
 
     if config.use_gelu:
         mt_model_builder: NllbBuilder = NllbWithGELUBuilder(
-            config.mt_model_config, device=device, dtype=dtype
+            UNITY_FAMILY, config.mt_model_config, device=device, dtype=dtype
         )
     else:
         mt_model_builder = NllbBuilder(
-            config.mt_model_config, device=device, dtype=dtype
+            UNITY_FAMILY, config.mt_model_config, device=device, dtype=dtype
         )
 
     unity_builder = UnitYBuilder(
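
Note: the field-by-field `VocabularyInfo` rebuilds above suggest the type became immutable; assuming it is a (frozen) dataclass, `dataclasses.replace` is a compact equivalent. The index values below are placeholders, not NLLB's real special-token indices:

from dataclasses import replace

from fairseq2.data import VocabularyInfo

vocab_info = VocabularyInfo(
    size=256206, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1  # placeholder indices
)
# Override only `size`, keeping the special token indices intact.
vocab_info = replace(vocab_info, size=256102)  # NLLB-100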

+ 8 - 33
src/seamless_communication/models/unity/char_tokenizer.py

@@ -4,6 +4,7 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
+from pathlib import Path
 from typing import Optional, Union, final
 
 from fairseq2.assets import (
@@ -14,36 +15,24 @@ from fairseq2.assets import (
 )
 from fairseq2.assets.card import AssetCard
 from fairseq2.data.text import (
-    SentencePieceDecoder,
+    SentencePieceTokenizer,
     SentencePieceEncoder,
-    SentencePieceModel,
-    TextTokenDecoder,
-    TextTokenEncoder,
-    TextTokenizer,
-    vocab_info_from_sentencepiece,
 )
-from fairseq2.data.typing import PathLike
-from fairseq2.typing import Device, finaloverride
+from fairseq2.typing import Device, override
 
 
 @final
-class CharTokenizer(TextTokenizer):
+class CharTokenizer(SentencePieceTokenizer):
     """A character-level tokenizer used during non-autoregressive T2U decoding."""
 
-    model: SentencePieceModel
-
-    def __init__(self, pathname: PathLike) -> None:
+    def __init__(self, path: Path) -> None:
         """
         :param pathname:
             The pathname of the SentencePiece model file.
         """
-        self.model = SentencePieceModel(pathname)
-
-        vocab_info = vocab_info_from_sentencepiece(self.model)
-
-        super().__init__(vocab_info)
+        super().__init__(path)
 
-    @finaloverride
+    @override
     def create_encoder(
         self,
         task: Optional[str] = None,
@@ -51,24 +40,10 @@ class CharTokenizer(TextTokenizer):
         mode: Optional[str] = None,
         device: Optional[Device] = None,
         pin_memory: bool = False,
-    ) -> TextTokenEncoder:
+    ) -> SentencePieceEncoder:
         """Creates a character level encoder."""
-        return SentencePieceEncoder(
-            self.model,
-            device=device,
-            pin_memory=pin_memory,
-        )
-
-    @finaloverride
-    def create_raw_encoder(
-        self, *, device: Optional[Device] = None, pin_memory: bool = False
-    ) -> TextTokenEncoder:
         return SentencePieceEncoder(self.model, device=device, pin_memory=pin_memory)
 
-    @finaloverride
-    def create_decoder(self) -> TextTokenDecoder:
-        return SentencePieceDecoder(self.model)
-
 
 class UnitYCharTokenizerLoader:
     """Loads character-level tokenizers of UnitY models."""

+ 2 - 2
src/seamless_communication/models/unity/fft_decoder.py

@@ -10,7 +10,7 @@ from fairseq2.nn.module_list import ModuleList
 from fairseq2.nn.normalization import LayerNorm
 from fairseq2.nn.padding import PaddingMask
 from fairseq2.nn.transformer import TransformerNormOrder, create_standard_layer_norm
-from fairseq2.typing import DataType, Device, finaloverride
+from fairseq2.typing import DataType, Device, override
 from torch import Tensor
 from torch.nn import Module
 
@@ -61,7 +61,7 @@ class FeedForwardTransformer(Module):
 
         self.norm_order = norm_order
 
-    @finaloverride
+    @override
     def forward(
         self,
         seqs: Tensor,

+ 3 - 3
src/seamless_communication/models/unity/fft_decoder_layer.py

@@ -9,7 +9,7 @@ from typing import Optional, Tuple, final
 from fairseq2.nn.normalization import LayerNorm
 from fairseq2.nn.padding import PaddingMask, apply_padding_mask
 from fairseq2.nn.transformer import MultiheadAttention, create_standard_layer_norm
-from fairseq2.typing import DataType, Device, finaloverride
+from fairseq2.typing import DataType, Device, override
 from torch import Tensor
 from torch.nn import Conv1d, Dropout, Module, ReLU
 
@@ -71,7 +71,7 @@ class Conv1dBlock(Module):
             dtype=dtype,
         )
 
-    @finaloverride
+    @override
     def forward(self, seqs: Tensor, padding_mask: Optional[PaddingMask]) -> Tensor:
         # Ensure that we do not leak padded positions in the convolution layer.
         seqs = apply_padding_mask(seqs, padding_mask)
@@ -173,7 +173,7 @@ class FeedForwardTransformerLayer(Module):
         else:
             self.register_module("film", None)
 
-    @finaloverride
+    @override
     def forward(
         self,
         seqs: Tensor,

+ 78 - 76
src/seamless_communication/models/unity/loader.py

@@ -4,14 +4,14 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, Dict, List, Mapping, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import torch
-from fairseq2.assets import AssetStore, asset_store, download_manager
+from fairseq2.assets import AssetStore, asset_store
 from fairseq2.assets.card import AssetCard, AssetCardFieldNotFoundError
-from fairseq2.models.nllb import NllbConfig
-from fairseq2.models.nllb.loader import NllbTokenizerLoader
-from fairseq2.models.utils import ConfigLoader, ModelLoader
+from fairseq2.data.text import register_text_tokenizer
+from fairseq2.models import setup_model_family
+from fairseq2.models.nllb import NllbConfig, load_nllb_tokenizer
 from fairseq2.models.utils.checkpoint import convert_fairseq_checkpoint
 
 from seamless_communication.models.unity.builder import (
@@ -20,13 +20,13 @@ from seamless_communication.models.unity.builder import (
     unity_archs,
 )
 from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer
-from seamless_communication.models.unity.model import UnitYModel
+from seamless_communication.models.unity.model import UNITY_FAMILY
 from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer
 
 
 def convert_unity_checkpoint(
-    checkpoint: Mapping[str, Any], config: UnitYConfig
-) -> Mapping[str, Any]:
+    checkpoint: Dict[str, Any], config: UnitYConfig
+) -> Dict[str, Any]:
     state_dict = checkpoint["model"]
 
     # Check if we have a fairseq2 checkpoint.
@@ -39,7 +39,11 @@ def convert_unity_checkpoint(
 
     state_dict = checkpoint["model"]
 
-    keys_to_delete = []
+    keys_to_delete = [
+        "speech_encoder_frontend.pos_encoder.conv.bias",
+        "speech_encoder_frontend.pos_encoder.conv.weight_g",
+        "speech_encoder_frontend.pos_encoder.conv.weight_v",
+    ]
 
     # ExpressiveUnitY model (from multi_arch codebase)
     if config.prosody_encoder_config is not None:
@@ -203,42 +207,42 @@ def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]:
         # fmt: off
 
         # Speech Encoder
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.pos_conv\.0\.":                                    r"speech_encoder_frontend.pos_encoder.conv.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.layer_norm\.":                                              r"speech_encoder_frontend.post_extract_layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.post_extract_proj\.":                                       r"speech_encoder_frontend.model_dim_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.([0-9]+)\.0\.":             r"speech_encoder_frontend.feature_extractor.layers.\1.conv.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.([0-9]+)\.2\.1\.":          r"speech_encoder_frontend.feature_extractor.layers.\1.layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.0\.2\.":                    r"speech_encoder_frontend.feature_extractor.layers.0.group_norm.",
-
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.batch_norm\.":      r"speech_encoder.inner.layers.\1.conv.batch_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.layer_norm2\.":     r"speech_encoder.inner.layers.\1.conv.layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.":  r"speech_encoder.inner.layers.\1.conv.depthwise_conv.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.layer_norm\.":      r"speech_encoder.inner.layers.\1.conv_layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": r"speech_encoder.inner.layers.\1.conv.pointwise_conv1.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": r"speech_encoder.inner.layers.\1.conv.pointwise_conv2.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.":         r"speech_encoder.inner.layers.\1.ffn\2_layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.":                r"speech_encoder.inner.layers.\1.ffn\2.inner_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.":                r"speech_encoder.inner.layers.\1.ffn\2.output_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn_layer_norm\.":         r"speech_encoder.inner.layers.\1.self_attn_layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_q\.":          r"speech_encoder.inner.layers.\1.self_attn.q_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_k\.":          r"speech_encoder.inner.layers.\1.self_attn.k_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_v\.":          r"speech_encoder.inner.layers.\1.self_attn.v_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_out\.":        r"speech_encoder.inner.layers.\1.self_attn.output_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.q_proj\.":            r"speech_encoder.inner.layers.\1.self_attn.q_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.k_proj\.":            r"speech_encoder.inner.layers.\1.self_attn.k_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.v_proj\.":            r"speech_encoder.inner.layers.\1.self_attn.v_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.rel_k_embedding\.":   r"speech_encoder.inner.layers.\1.self_attn.sdpa.rel_k_embed.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.out_proj\.":          r"speech_encoder.inner.layers.\1.self_attn.output_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_pos\.":        r"speech_encoder.inner.layers.\1.self_attn.sdpa.r_proj.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.pos_bias_u":          r"speech_encoder.inner.layers.\1.self_attn.sdpa.u_bias",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.pos_bias_v":          r"speech_encoder.inner.layers.\1.self_attn.sdpa.v_bias",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.final_layer_norm\.":             r"speech_encoder.inner.layers.\1.layer_norm.",
-        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.":                                     r"speech_encoder.inner.layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.pos_conv\.0\.":                                    "speech_encoder_frontend.pos_encoder.conv.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.layer_norm\.":                                              "speech_encoder_frontend.post_extract_layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.post_extract_proj\.":                                       "speech_encoder_frontend.model_dim_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.([0-9]+)\.0\.":             "speech_encoder_frontend.feature_extractor.layers.\\1.conv.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.([0-9]+)\.2\.1\.":          "speech_encoder_frontend.feature_extractor.layers.\\1.layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.0\.2\.":                    "speech_encoder_frontend.feature_extractor.layers.0.group_norm.",
+
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.batch_norm\.":      "speech_encoder.inner.layers.\\1.conv.batch_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.layer_norm2\.":     "speech_encoder.inner.layers.\\1.conv.layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.":  "speech_encoder.inner.layers.\\1.conv.depthwise_conv.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.layer_norm\.":      "speech_encoder.inner.layers.\\1.conv_layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": "speech_encoder.inner.layers.\\1.conv.pointwise_conv1.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": "speech_encoder.inner.layers.\\1.conv.pointwise_conv2.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.":         "speech_encoder.inner.layers.\\1.ffn\\2_layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.":                "speech_encoder.inner.layers.\\1.ffn\\2.inner_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.":                "speech_encoder.inner.layers.\\1.ffn\\2.output_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn_layer_norm\.":         "speech_encoder.inner.layers.\\1.self_attn_layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_q\.":          "speech_encoder.inner.layers.\\1.self_attn.q_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_k\.":          "speech_encoder.inner.layers.\\1.self_attn.k_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_v\.":          "speech_encoder.inner.layers.\\1.self_attn.v_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_out\.":        "speech_encoder.inner.layers.\\1.self_attn.output_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.q_proj\.":            "speech_encoder.inner.layers.\\1.self_attn.q_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.k_proj\.":            "speech_encoder.inner.layers.\\1.self_attn.k_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.v_proj\.":            "speech_encoder.inner.layers.\\1.self_attn.v_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.rel_k_embedding\.":   "speech_encoder.inner.layers.\\1.self_attn.sdpa.rel_k_embed.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.out_proj\.":          "speech_encoder.inner.layers.\\1.self_attn.output_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_pos\.":        "speech_encoder.inner.layers.\\1.self_attn.sdpa.r_proj.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.pos_bias_u":          "speech_encoder.inner.layers.\\1.self_attn.sdpa.u_bias",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.pos_bias_v":          "speech_encoder.inner.layers.\\1.self_attn.sdpa.v_bias",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.final_layer_norm\.":             "speech_encoder.inner.layers.\\1.layer_norm.",
+        fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.":                                     "speech_encoder.inner.layer_norm.",
 
         # Speech Encoder Adaptor
-        fr"^{encoder_key}\.adaptor\.proj\.0\.": r"speech_encoder.proj1.",
-        fr"^{encoder_key}\.adaptor\.proj\.2\.": r"speech_encoder.proj2.",
-        fr"^{encoder_key}\.adaptor\.out_ln\.":  r"speech_encoder.layer_norm.",
+        fr"^{encoder_key}\.adaptor\.proj\.0\.": "speech_encoder.proj1.",
+        fr"^{encoder_key}\.adaptor\.proj\.2\.": "speech_encoder.proj2.",
+        fr"^{encoder_key}\.adaptor\.out_ln\.":  "speech_encoder.layer_norm.",
 
         # Text Encoder
         r"^text_encoder\.embed_tokens\.":                              r"text_encoder_frontend.embed.",
@@ -264,13 +268,13 @@ def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]:
     if config.w2v2_encoder_config.use_conformer:
         key_map.update(
             {
-                fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner_layer_norm."
+                fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": "speech_encoder.inner_layer_norm."
             }
         )
     else:
         key_map.update(
             {
-                rf"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner.layer_norm."
+                rf"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": "speech_encoder.inner.layer_norm."
             }
         )
     # fmt: on
@@ -279,20 +283,20 @@ def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]:
         key_map.update(
             {
                 # fmt: off
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.out_proj\.":          r"speech_encoder.adaptor_layers.\1.block.self_attn.output_proj.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.":                    r"speech_encoder.adaptor_layers.\1.block.self_attn.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn_layer_norm\.":         r"speech_encoder.adaptor_layers.\1.block.self_attn_layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.":         r"speech_encoder.adaptor_layers.\1.block.ffn\2_layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.":                r"speech_encoder.adaptor_layers.\1.block.ffn\2.inner_proj.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.":                r"speech_encoder.adaptor_layers.\1.block.ffn\2.output_proj.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.batch_norm\.":      r"speech_encoder.adaptor_layers.\1.block.conv.batch_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.":  r"speech_encoder.adaptor_layers.\1.block.conv.depthwise_conv.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.layer_norm\.":      r"speech_encoder.adaptor_layers.\1.block.conv_layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": r"speech_encoder.adaptor_layers.\1.block.conv.pointwise_conv1.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": r"speech_encoder.adaptor_layers.\1.block.conv.pointwise_conv2.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.final_layer_norm\.":             r"speech_encoder.adaptor_layers.\1.block.layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_ln\.":                      r"speech_encoder.adaptor_layers.\1.layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_pool\.1\.":                 r"speech_encoder.adaptor_layers.\1.conv.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.out_proj\.":          "speech_encoder.adaptor_layers.\\1.block.self_attn.output_proj.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.":                    "speech_encoder.adaptor_layers.\\1.block.self_attn.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn_layer_norm\.":         "speech_encoder.adaptor_layers.\\1.block.self_attn_layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.":         "speech_encoder.adaptor_layers.\\1.block.ffn\\2_layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.":                "speech_encoder.adaptor_layers.\\1.block.ffn\\2.inner_proj.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.":                "speech_encoder.adaptor_layers.\\1.block.ffn\\2.output_proj.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.batch_norm\.":      "speech_encoder.adaptor_layers.\\1.block.conv.batch_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.":  "speech_encoder.adaptor_layers.\\1.block.conv.depthwise_conv.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.layer_norm\.":      "speech_encoder.adaptor_layers.\\1.block.conv_layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": "speech_encoder.adaptor_layers.\\1.block.conv.pointwise_conv1.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": "speech_encoder.adaptor_layers.\\1.block.conv.pointwise_conv2.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.final_layer_norm\.":             "speech_encoder.adaptor_layers.\\1.block.layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_ln\.":                      "speech_encoder.adaptor_layers.\\1.layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_pool\.1\.":                 "speech_encoder.adaptor_layers.\\1.conv.",
                 # fmt: on
             }
         )
@@ -300,15 +304,15 @@ def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]:
         key_map.update(
             {
                 # fmt: off
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.residual_layer_norm\.":  r"speech_encoder.adaptor_layers.\1.residual_layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.residual_pool\.1\.":     r"speech_encoder.adaptor_layers.\1.residual_conv.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.attn_pool\.1\.":         r"speech_encoder.adaptor_layers.\1.self_attn_conv.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.out_proj\.":  r"speech_encoder.adaptor_layers.\1.self_attn.output_proj.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.":            r"speech_encoder.adaptor_layers.\1.self_attn.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"speech_encoder.adaptor_layers.\1.self_attn_layer_norm.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.fc1\.":                  r"speech_encoder.adaptor_layers.\1.ffn.inner_proj.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.fc2\.":                  r"speech_encoder.adaptor_layers.\1.ffn.output_proj.",
-                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.final_layer_norm\.":     r"speech_encoder.adaptor_layers.\1.ffn_layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.residual_layer_norm\.":  "speech_encoder.adaptor_layers.\\1.residual_layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.residual_pool\.1\.":     "speech_encoder.adaptor_layers.\\1.residual_conv.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.attn_pool\.1\.":         "speech_encoder.adaptor_layers.\\1.self_attn_conv.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.out_proj\.":  "speech_encoder.adaptor_layers.\\1.self_attn.output_proj.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.":            "speech_encoder.adaptor_layers.\\1.self_attn.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn_layer_norm\.": "speech_encoder.adaptor_layers.\\1.self_attn_layer_norm.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.fc1\.":                  "speech_encoder.adaptor_layers.\\1.ffn.inner_proj.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.fc2\.":                  "speech_encoder.adaptor_layers.\\1.ffn.output_proj.",
+                fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.final_layer_norm\.":     "speech_encoder.adaptor_layers.\\1.ffn_layer_norm.",
                 # fmt: on
             }
         )
@@ -389,20 +393,18 @@ def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]:
     return key_map
 
 
-load_unity_config = ConfigLoader[UnitYConfig](asset_store, unity_archs)
-
-
-load_unity_model = ModelLoader[UnitYModel, UnitYConfig](
-    asset_store,
-    download_manager,
-    load_unity_config,
+load_unity_model, load_unity_config = setup_model_family(
+    UNITY_FAMILY,
+    UnitYConfig,
     create_unity_model,
+    unity_archs,
     convert_unity_checkpoint,
     restrict_checkpoints=False,
 )
 
+load_unity_text_tokenizer = load_nllb_tokenizer
 
-load_unity_text_tokenizer = NllbTokenizerLoader(asset_store, download_manager)
+register_text_tokenizer(UNITY_FAMILY, load_unity_text_tokenizer)
 
 
 class UnitYUnitTokenizerLoader:
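
The key-map rewrite above is purely notational: as a replacement string, the raw literal r"...\1..." and the escaped literal "...\\1..." are the same characters, so re.sub behaves identically. A quick self-contained check with a hypothetical checkpoint key:

    import re

    assert r"layers.\1.conv." == "layers.\\1.conv."  # identical strings

    old_key = "encoder.w2v_encoder.w2v_model.encoder.layers.3.conv_module.batch_norm.weight"
    new_key = re.sub(
        r"^encoder\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.batch_norm\.",
        "speech_encoder.inner.layers.\\1.conv.batch_norm.",
        old_key,
    )
    assert new_key == "speech_encoder.inner.layers.3.conv.batch_norm.weight"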

+ 13 - 8
src/seamless_communication/models/unity/model.py

@@ -5,7 +5,7 @@
 # MIT_LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union, final
+from typing import Final, Optional, Tuple, Union, final
 
 from fairseq2.data import VocabularyInfo
 from fairseq2.models.encoder_decoder import EncoderDecoderModel
@@ -23,6 +23,8 @@ from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN
 from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer
 from seamless_communication.models.unity.nar_decoder_frontend import NARDecoderFrontend
 
+UNITY_FAMILY: Final = "unity"
+
 
 @final
 class UnitYModel(EncoderDecoderModel):
@@ -55,13 +57,14 @@ class UnitYModel(EncoderDecoderModel):
         text_decoder: Optional[TransformerDecoder],
         final_proj: Optional[Projection],
         t2u_model: Union["UnitYT2UModel", "UnitYNART2UModel", None],
+        max_target_seq_len: int,
         target_vocab_info: VocabularyInfo,
         prosody_encoder_model: Optional[ECAPA_TDNN] = None,
         input_modality: str = "speech",
     ) -> None:
         model_dim = speech_encoder.model_dim
 
-        super().__init__(model_dim, target_vocab_info)
+        super().__init__(UNITY_FAMILY, model_dim, max_target_seq_len, target_vocab_info)
 
         self.input_modality = input_modality
 
@@ -190,7 +193,7 @@ class UnitYModel(EncoderDecoderModel):
 
         logits = self.final_proj(decoder_output)
 
-        return SequenceModelOutput(logits, self.target_vocab_info)
+        return SequenceModelOutput(logits, self.target_vocab_info.pad_idx)
 
 
 @final
@@ -209,11 +212,12 @@ class UnitYX2TModel(EncoderDecoderModel):
         decoder_frontend: TransformerFrontend,
         decoder: TransformerDecoder,
         final_proj: Projection,
+        max_target_seq_len: int,
         target_vocab_info: VocabularyInfo,
     ) -> None:
         model_dim = encoder.model_dim
 
-        super().__init__(model_dim, target_vocab_info)
+        super().__init__(UNITY_FAMILY, model_dim, max_target_seq_len, target_vocab_info)
 
         self.encoder_frontend = encoder_frontend
         self.encoder = encoder
@@ -257,7 +261,7 @@ class UnitYX2TModel(EncoderDecoderModel):
     ) -> SequenceModelOutput:
         logits = self.final_proj(decoder_output)
 
-        return SequenceModelOutput(logits, self.target_vocab_info)
+        return SequenceModelOutput(logits, self.target_vocab_info.pad_idx)
 
 
 @final
@@ -276,9 +280,10 @@ class UnitYT2UModel(EncoderDecoderModel):
         decoder_frontend: TransformerFrontend,
         decoder: TransformerDecoder,
         final_proj: Projection,
+        max_target_seq_len: int,
         target_vocab_info: VocabularyInfo,
     ) -> None:
-        super().__init__(decoder.model_dim, target_vocab_info)
+        super().__init__(UNITY_FAMILY, decoder.model_dim, max_target_seq_len, target_vocab_info)
 
         if encoder is not None:
             self.encoder = encoder
@@ -324,7 +329,7 @@ class UnitYT2UModel(EncoderDecoderModel):
     ) -> SequenceModelOutput:
         logits = self.final_proj(decoder_output)
 
-        return SequenceModelOutput(logits, self.target_vocab_info)
+        return SequenceModelOutput(logits, self.target_vocab_info.pad_idx)
 
 
 @final
@@ -438,7 +443,7 @@ class UnitYNART2UModel(Module):
     def project(self, decoder_output: Tensor) -> SequenceModelOutput:
         logits = self.final_proj(decoder_output)
 
-        return SequenceModelOutput(logits, self.target_vocab_info)
+        return SequenceModelOutput(logits, self.target_vocab_info.pad_idx)
 
 
 @dataclass
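
The same two changes repeat across every variant above: the superclass now receives the family name plus max_target_seq_len, and SequenceModelOutput takes the pad index rather than the whole VocabularyInfo. A hedged sketch of the shared pattern, using a hypothetical subclass with components elided and the imports already present in this module:

    class MyS2TModel(EncoderDecoderModel):  # hypothetical subclass
        def __init__(self, encoder, decoder, final_proj, max_target_seq_len, target_vocab_info):
            super().__init__(UNITY_FAMILY, encoder.model_dim, max_target_seq_len, target_vocab_info)
            self.final_proj = final_proj
            # ... remaining components elided ...

        def project(self, decoder_output):
            logits = self.final_proj(decoder_output)
            # The pad index, not the whole VocabularyInfo, under the new signature.
            return SequenceModelOutput(logits, self.target_vocab_info.pad_idx)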

+ 2 - 2
src/seamless_communication/models/unity/nar_decoder_frontend.py

@@ -15,7 +15,7 @@ from fairseq2.nn.normalization import LayerNorm
 from fairseq2.nn.padding import PaddingMask
 from fairseq2.nn.position_encoder import PositionEncoder
 from fairseq2.nn.transformer import create_standard_layer_norm
-from fairseq2.typing import DataType, Device, finaloverride
+from fairseq2.typing import DataType, Device, override
 from torch import Tensor
 from torch.nn import Dropout, Module, Parameter
 
@@ -296,7 +296,7 @@ class NARDecoderFrontend(Module):
 
         return seqs
 
-    @finaloverride
+    @override
     def forward(
         self,
         encoder_output: Tensor,

+ 5 - 6
src/seamless_communication/models/unity/t2u_builder.py

@@ -6,15 +6,14 @@
 from dataclasses import dataclass
 from typing import Literal, Optional, Union
 
-from fairseq2.assets import asset_store, download_manager
 from fairseq2.assets.card import AssetCard
 from fairseq2.data import VocabularyInfo
-from fairseq2.models.nllb.loader import NllbTokenizerLoader
+from fairseq2.models.nllb import load_nllb_tokenizer
 from fairseq2.models.transformer import (
     TransformerEmbeddingFrontend,
     TransformerFrontend,
 )
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.nn.embedding import Embedding, StandardEmbedding, init_scaled_embedding
 from fairseq2.nn.position_encoder import SinusoidalPositionEncoder
 from fairseq2.nn.projection import Linear, Projection, TiedProjection
@@ -131,8 +130,7 @@ class UnitYT2UConfig:
     """The dimensionality of prosody encoder (e.g. ECAPA_TDNN) output"""
 
 
-unity_t2u_archs = ArchitectureRegistry[UnitYT2UConfig]("unity_t2u")
-
+unity_t2u_archs = ModelArchitectureRegistry[UnitYT2UConfig]()
 
 unity_t2u_arch = unity_t2u_archs.decorator
 
@@ -329,6 +327,7 @@ class UnitYT2UBuilder:
             decoder_frontend,
             decoder,
             final_proj,
+            self.config.unit_max_seq_len,
             self.config.target_vocab_info,
         )
 
@@ -598,7 +597,7 @@ class UnitYNART2UBuilder:
             self.config.nar_decoder_frontend_config
         )
 
-        nllb_tokenizer = NllbTokenizerLoader(asset_store, download_manager)(
+        nllb_tokenizer = load_nllb_tokenizer(
             self.config.nar_decoder_config.model_name_or_card
         )
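
The tokenizer loader is now a plain function taking a single name-or-card argument, per the call site above. A hedged one-liner; the card name is hypothetical:

    from fairseq2.models.nllb import load_nllb_tokenizer

    tokenizer = load_nllb_tokenizer("nllb-200_dense_distill_600m")  # hypothetical card name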
 

+ 2 - 2
src/seamless_communication/models/vocoder/builder.py

@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
-from fairseq2.models.utils.arch_registry import ArchitectureRegistry
+from fairseq2.models.architecture_registry import ModelArchitectureRegistry
 from fairseq2.typing import DataType, Device
 
 from seamless_communication.models.vocoder.codehifigan import CodeGenerator
@@ -34,7 +34,7 @@ class VocoderConfig:
     lang_spkr_idx_map: Dict[str, Any]
 
 
-vocoder_archs = ArchitectureRegistry[VocoderConfig]("vocoder_code_hifigan")
+vocoder_archs = ModelArchitectureRegistry[VocoderConfig]()
 
 vocoder_arch = vocoder_archs.decorator
 

+ 9 - 13
src/seamless_communication/models/vocoder/loader.py

@@ -4,22 +4,21 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, Mapping
+from typing import Any, Dict
 
-from fairseq2.assets import asset_store, download_manager
-from fairseq2.models.utils import ConfigLoader, ModelLoader
+from fairseq2.models import setup_model_family
 
 from seamless_communication.models.vocoder.builder import (
     VocoderConfig,
     create_vocoder_model,
     vocoder_archs,
 )
-from seamless_communication.models.vocoder.vocoder import Vocoder
+from seamless_communication.models.vocoder.vocoder import VOCODER_CODE_HIFIGAN_FAMILY
 
 
 def convert_vocoder_checkpoint(
-    checkpoint: Mapping[str, Any], config: VocoderConfig
-) -> Mapping[str, Any]:
+    checkpoint: Dict[str, Any], config: VocoderConfig
+) -> Dict[str, Any]:
     if (
         "model" in checkpoint
         and "code_generator.resblocks.0.convs1.0.weight_g" in checkpoint["model"]
@@ -36,13 +35,10 @@ def convert_vocoder_checkpoint(
     return checkpoint
 
 
-load_vocoder_config = ConfigLoader[VocoderConfig](asset_store, vocoder_archs)
-
-
-load_vocoder_model = ModelLoader[Vocoder, VocoderConfig](
-    asset_store,
-    download_manager,
-    load_vocoder_config,
+load_vocoder_model, load_vocoder_config = setup_model_family(
+    VOCODER_CODE_HIFIGAN_FAMILY,
+    VocoderConfig,
     create_vocoder_model,
+    vocoder_archs,
     convert_vocoder_checkpoint,
 )
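
setup_model_family collapses the former ConfigLoader/ModelLoader pair into a single call that returns both loaders, mirroring the UnitY loader above. A hedged usage sketch; the card name and any extra keyword arguments are assumptions, not shown in this diff:

    config = load_vocoder_config("vocoder_code_hifigan")  # assumed card name
    vocoder = load_vocoder_model("vocoder_code_hifigan")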

+ 6 - 5
src/seamless_communication/models/vocoder/vocoder.py

@@ -4,21 +4,22 @@
 # This source code is licensed under the license found in the
 # MIT_LICENSE file in the root directory of this source tree.
 
-from typing import Any, Dict, Optional, List, Union
+from typing import Any, Dict, Final, List, Optional, Union
 import torch
 from torch import Tensor
-from torch.nn import Module
+from fairseq2.models import Model
 
 from seamless_communication.models.vocoder.codehifigan import CodeGenerator
 
+VOCODER_CODE_HIFIGAN_FAMILY: Final = "vocoder_code_hifigan"
+
 
-class Vocoder(Module):
+class Vocoder(Model):
     def __init__(
         self,
         code_generator: CodeGenerator,
         lang_spkr_idx_map: Dict[str, Any],
     ):
-        super().__init__()
+        super().__init__(VOCODER_CODE_HIFIGAN_FAMILY)
         self.code_generator = code_generator
         self.lang_spkr_idx_map = lang_spkr_idx_map
 
@@ -29,7 +30,7 @@ class Vocoder(Module):
         spkr_list: Union[Optional[List[int]], int] = None,
         dur_prediction: bool = True,
     ) -> Tensor:
-        # TODO: Do we need this backward compatibility, or just update all calling sites? 
+        # TODO: Do we need this backward compatibility, or just update all calling sites?
         if len(units.shape) == 1:
             units = units.unsqueeze(0) # add batch dim
         if isinstance(lang_list, str):
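
forward() normalizes un-batched inputs itself, as the unsqueeze/isinstance checks above show. A hedged call sketch; the language code and speaker index are placeholders:

    import torch

    units = torch.tensor([12, 44, 44, 7])  # 1-D; forward() adds the batch dim
    # wav = vocoder(units, lang_list="eng", spkr_list=0)  # "eng"/0 are placeholders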

+ 1 - 2
src/seamless_communication/toxicity/etox_bad_word_checker.py

@@ -16,7 +16,6 @@ from fairseq2.assets import (
     asset_store as base_asset_store,
     download_manager as base_download_manager,
 )
-from fairseq2.data import StringLike
 from fairseq2.data.text import SentencePieceEncoder, SentencePieceModel
 
 
@@ -116,7 +115,7 @@ class ETOXBadWordChecker:
 
     @staticmethod
     def _contains_tokens(
-        text_tokens: List[StringLike], word_tokens: List[StringLike]
+        text_tokens: List[str], word_tokens: List[str]
     ) -> bool:
         for i in range(len(text_tokens) - len(word_tokens) + 1):
             for j in range(len(word_tokens)):
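
_contains_tokens is a sliding-window exact match over token lists. An equivalent standalone sketch using slice comparison; the names are illustrative:

    from typing import List

    def contains_tokens(text_tokens: List[str], word_tokens: List[str]) -> bool:
        # Slide a window of len(word_tokens) across text_tokens; any exact match wins.
        n = len(word_tokens)
        return any(
            text_tokens[i : i + n] == word_tokens
            for i in range(len(text_tokens) - n + 1)
        )

    assert contains_tokens(["a", "very", "bad", "word"], ["bad", "word"])
    assert not contains_tokens(["all", "good"], ["bad"])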

+ 7 - 8
src/seamless_communication/toxicity/mintox.py

@@ -18,7 +18,6 @@ from seamless_communication.toxicity.etox_bad_word_checker import (
 )
 from fairseq2.generation import BannedSequenceProcessor
 from fairseq2.data.text.text_tokenizer import TextTokenizer
-from fairseq2.data.typing import StringLike
 from fairseq2.typing import Device
 from fairseq2.data import SequenceData
 from fairseq2.nn.padding import get_seqs_and_padding_mask
@@ -32,8 +31,8 @@ logger = logging.getLogger(__name__)
 
 
 def _extract_bad_words_with_batch_indices(
-    source_texts: List[StringLike],
-    target_texts: List[StringLike],
+    source_texts: List[str],
+    target_texts: List[str],
     source_lang: str,
     target_lang: str,
     bad_word_checker: ETOXBadWordChecker,
@@ -54,9 +53,9 @@ def _extract_bad_words_with_batch_indices(
 
 
 def _replace_with_new_text_output_in_batch(
-    original_texts: List[StringLike],
+    original_texts: List[str],
     indices_with_toxicity: List[int],
-    new_texts: List[StringLike],
+    new_texts: List[str],
 ) -> None:
     new_idx = 0
     # indices_with_toxicity is a small list, so using a list should be fast enough.
@@ -100,8 +99,8 @@ def mintox_pipeline(
     model_input: SequenceData,
     input_modality: "Modality",
     output_modality: "Modality",
-    src_texts: List[StringLike],
-    original_texts: List[StringLike],
+    src_texts: List[str],
+    original_texts: List[str],
     original_units: Optional[Tensor] = None,
     unit_generation_ngram_filtering: bool = False,
     text_generation_opts: Optional[SequenceGeneratorOptions] = None,
@@ -109,7 +108,7 @@ def mintox_pipeline(
-    bad_word_checker: ETOXBadWordChecker = None,
+    bad_word_checker: Optional[ETOXBadWordChecker] = None,
     duration_factor: float = 1.0,
     prosody_encoder_input: Optional[SequenceData] = None,
-) -> Tuple[List[StringLike], Optional[Tensor]]:
+) -> Tuple[List[str], Optional[Tensor]]:
     """MinTox: Mitigation at INference time of added TOXicity."""
     from seamless_communication.inference.translator import Modality, Translator