
M4T training scripts and recipes

mavlyutov 1 year ago
parent commit efe88afa2e

+ 2 - 1
dev_requirements.txt

@@ -1,4 +1,5 @@
 pytest
 black
 flake8
-isort
+isort
+mypy

+ 0 - 0
scripts/m4t/train/__init__.py


+ 247 - 0
scripts/m4t/train/configs.py

@@ -0,0 +1,247 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass
+from typing import Dict, Any, Union, get_origin, get_args, List, Literal, Optional
+
+
+@dataclass
+class Config:
+    def serialize(self):
+        asdict = {}
+        for key in self.__dataclass_fields__.keys():
+            value = getattr(self, key)
+            if isinstance(value, Config):
+                asdict[key] = value.serialize()
+            else:
+                asdict[key] = value
+        return asdict
+
+    @classmethod
+    def _is_config(cls, type_like: Any) -> bool:
+        """ checks if type_like class is a subclass of Config"""
+        try:
+            if issubclass(type_like, Config):
+                return True
+        except TypeError:
+            pass
+        return False
+
+    @classmethod
+    def _is_optional_config(cls, type_like: Any) -> bool:
+        """ checks if type_like == Optional[subclass of Config] """
+        if not get_origin(type_like) == Union:
+            return False
+        args = [arg for arg in get_args(type_like) if arg is not type(None)]
+        return len(args) == 1 and cls._is_config(args[0])
+
+    @classmethod
+    def deserialize(cls, asdict: Dict[str, Any]):
+        kwargs = {}
+        for key, field_desc in cls.__dataclass_fields__.items():
+            non_null = asdict.get(key) is not None
+            # Optional[Config]
+            if cls._is_optional_config(field_desc.type):
+                if non_null:
+                    type_arg = [arg for arg in get_args(field_desc.type) if arg is not type(None)][0]
+                    kwargs[key] = type_arg.deserialize(asdict[key])
+                else:
+                    kwargs[key] = None
+            # TODO: add containers with Config
+            elif get_origin(field_desc.type) in [Union, List, Dict, Literal]:
+                kwargs[key] = asdict.get(key)
+            elif cls._is_config(field_desc.type):
+                if non_null:
+                    kwargs[key] = field_desc.type.deserialize(asdict[key])
+                else:
+                    kwargs[key] = field_desc.type.default  # type: ignore
+            else:
+                kwargs[key] = asdict.get(key)
+        return cls(**kwargs)
+
+
+@dataclass
+class TextTokenizationConfig(Config):
+    from_model: Optional[str] = "seamlessM4T_large"
+    """If set, using a tokenizer from the model cards."""
+
+    spm_path: Optional[str] = None
+    """Path to a custom spm model. Not used if `from_model` is set."""
+
+    langtoks: Optional[List[str]] = None
+    """List of language tokens that should be added. Not used if `from_model` is set."""
+
+
+@dataclass
+class UnitTokenizationConfig(Config):
+    from_model: Optional[str] = "seamlessM4T_large"
+    """If set, using tokenizer from a model card."""
+
+    num_units: Optional[int] = None
+    """Alternatively, build custom tokenizer, set number of units"""
+
+    langtoks: Optional[List[str]] = None
+    """List of language tokens that should be added. Not used if `from_model` is set."""
+
+
+@dataclass
+class AudioProcessingConfig(Config):
+    audio_root_dir: str = "/"
+    """The root directory of the zipped audio files."""
+
+    fbanks_standardize_audio: bool = True
+
+    fbanks_num_mel_bins: int = 80
+
+    fbanks_waveform_scale: int = 2**15
+
+
+@dataclass
+class DataLoadingConfig(Config):
+    manifest_list_path: Optional[str] = None
+    """Path to a file with the list of tsv manifests"""
+
+    manifest_list: Optional[str] = None
+    """Comma separated list of tsv manifests. Can be combined with `manifest_list_path`"""
+
+    manifest_path_prefix: Optional[str] = None
+    """Path prefix to manifest files (root directory)"""
+
+    audio: AudioProcessingConfig = AudioProcessingConfig()
+    """ Audio processing params """
+
+    text_tokenization: TextTokenizationConfig = TextTokenizationConfig()
+    """ Text tokenization params """
+
+    unit_tokenization: UnitTokenizationConfig = UnitTokenizationConfig()
+    """ Units tokenization params """
+
+    unit_tokenizer_name: Optional[str] = "seamlessM4T_large"
+
+    prepend_tgt_lang_tag: bool = True
+    """ Prepend output text sequence with target lang token"""
+
+    fbank_feats_pad_idx: int = 0
+    """The pad index to use in fbanks batching."""
+
+    max_tgt_text_tokens_per_batch: Optional[int] = 1000
+    """ Defines flexible batch construction """
+
+    fixed_batch_size: Optional[int] = None
+    """ If set, uses fixed batch size """
+
+    max_seconds_per_input_audio: int = 15
+    """Accept only samples with less than max_seconds_per_input_audio ( waveform.shape[0] * SR )"""
+
+    max_tgt_text_tokens_per_sample: int = 300
+    """Accept only samples with less than max_sequence_length units"""
+
+    max_units_per_sample: int = 1500
+    """Accept only samples with less than max_sequence_length units"""
+
+    num_threads: int = 5
+    """The number of parallel threads during data reading and processing."""
+
+    shuffle_window: Optional[int] = 1000
+    """The size of sliding shuffle window."""
+
+    prefech_batches: Optional[int] = 10
+    """How many batches to prefetch in the background."""
+
+
+@dataclass
+class CustomModelParams(Config):
+    model_embed_dim: int = 1024
+
+    w2v2_encoder_layers: int = 24
+
+    w2v2_encoder_layers_use_conformer: bool = True
+
+    w2v2_encoder_layers_layernorm_features: bool = False
+
+    w2v2_pos_encoder_type: Literal["conv", "relative", "rotary"] = "relative"
+
+    w2v2_pos_encoder_depth: int = 0
+
+    w2v2_pos_conv_kernel_size: int = 0
+
+    w2v2_num_pos_conv_groups: int = 0
+
+    nllb_encoder_layers: int = 24
+
+    nllb_decoder_layers: int = 24
+
+    t2u_encoder_layers: int = 6
+
+    t2u_decoder_layers: int = 6
+
+    nllb_vocabulary_size: int = 256102  # num_tokens + langs + spec symbols
+
+    unit_vocabulary_size: int = 10082
+
+
+@dataclass
+class ModelConfig(Config):
+    from_model: Optional[str] = None
+    """If set, initialize a model defined in model cards. Also loads model weights."""
+
+    from_model_config: Optional[str] = None
+    """If set, initialize a model defined in model cards. Doesn't load weights."""
+
+    custom_params: Optional[CustomModelParams] = None
+    """If set, intitalize a new model with custom parameters"""
+
+    pretrained_w2v2_path: Optional[str] = None
+    """If set, use pre-trained w2v block"""
+
+    pretrained_s2t_decoder_path: Optional[str] = None
+    """If set, use pre-trained s2t decoder (NLLB)"""
+
+    pretrained_t2u_path: Optional[str] = None
+    """If set, use pre-trained t2u weights"""
+
+
+@dataclass
+class TrainingParams(Config):
+    max_epochs: int = 100
+    """ Maximum number of trainign epochs"""
+
+    label_smoothing: float = 0.2
+    """ Label smoothing coefficient for nll_loss """
+
+    warmup_steps: int = 1000
+    """ Number of steps with linearly increasing LR"""
+
+    log_steps: int = 200
+    """ Log inner loss after each `log_steps` training steps"""
+
+    eval_steps: int = 1000
+    """ Get eval loss after each `eval_steps` training steps """
+
+    patience: int = 10
+    """ Terminate if eval loss did not improve
+    over the last `patience * eval_steps` training steps"""
+
+    learning_rate: float = 1e-4
+    """ Optimizer learining rate """
+
+    start_learning_rate: float = 1e-7
+    """ Start learining rate """
+
+    float_dtype: Literal["fp16", "bf16", "fp32"] = "bf16"
+    """ Dtype used for float numbers, defines training precision """
+
+
+@dataclass
+class WorkflowParams(Config):
+    training: TrainingParams
+
+    model: ModelConfig
+
+    train_data: DataLoadingConfig
+
+    eval_data: DataLoadingConfig
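
For reference, a minimal sketch of the Config round trip that the recipe YAML files below rely on (serialize a nested config to a plain dict, dump/load it as YAML, rebuild it with deserialize), assuming PyYAML and the m4t_scripts.train import path used by the other scripts:

    import yaml

    from m4t_scripts.train.configs import (
        CustomModelParams,
        DataLoadingConfig,
        ModelConfig,
        TrainingParams,
        WorkflowParams,
    )

    params = WorkflowParams(
        training=TrainingParams(max_epochs=10),
        model=ModelConfig(custom_params=CustomModelParams(w2v2_encoder_layers=6)),
        train_data=DataLoadingConfig(manifest_list="train_asr"),
        eval_data=DataLoadingConfig(manifest_list="dev_asr"),
    )

    as_dict = params.serialize()            # nested Config objects become plain dicts
    recipe_text = yaml.safe_dump(as_dict)   # same layout as the recipes/*.yaml files below

    restored = WorkflowParams.deserialize(yaml.safe_load(recipe_text))
    assert restored.model.custom_params is not None
    assert restored.model.custom_params.w2v2_encoder_layers == 6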

+ 520 - 0
scripts/m4t/train/dataloader.py

@@ -0,0 +1,520 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+import os
+from typing import Any, Dict, Iterator, List, NamedTuple, Optional, Tuple, Union
+import ctypes
+
+import torch
+from m4t_scripts.train.configs import AudioProcessingConfig, DataLoadingConfig
+from torch import Tensor
+
+from fairseq2.data import (
+    CollateOptionsOverride,
+    Collater,
+    DataPipeline,
+    DataPipelineBuilder,
+    FileMapper,
+)
+from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
+from fairseq2.data.text import SentencePieceEncoder, StrSplitter, read_text
+from fairseq2.models.nllb.tokenizer import NllbTokenizer
+from seamless_communication.models.tokenizer import SPMTokenizer
+from seamless_communication.models.unity import (
+    UnitTokenizer,
+    load_unity_text_tokenizer,
+    load_unity_unit_tokenizer,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SeqsBatch(NamedTuple):
+    src_tokens: Optional[Tensor]
+    src_lengths: Optional[Tensor]
+    target_tokens: Tensor
+    prev_output_tokens: Tensor
+    target_lengths: Tensor
+    prefix_tokens: Optional[Tensor]
+
+
+class MultimodalSeqsBatch(NamedTuple):
+    speech_to_text: SeqsBatch
+    text_to_units: SeqsBatch
+
+
+class UnityDataLoader:
+    CPU_DEVICE = torch.device("cpu")
+    MANIFEST_EXT = ".tsv"
+    MANIFEST_COLUMN_SEP = "\t"
+    AUDIO_COLUMN_NAME = "audio"
+    TARGET_TEXT_COLUMN = "raw_tgt_text"
+    TARGET_UNITS_COLUMN = "tgt_text"
+    TARGET_LANG_COLUMN = "tgt_lang"
+    ROOT_COLUMN = "_"
+    BATCH_WIDTH_STEP = 8
+
+    def __init__(
+        self,
+        config: DataLoadingConfig,
+        rank: int = 0,
+        world_size: int = 1,
+        target_device: torch.device = CPU_DEVICE,
+        float_dtype: torch.dtype = torch.float16,  # training/inference precision
+    ):
+        self.config = config
+        self.rank = rank
+        self.world_size = world_size
+        self.target_device = target_device
+        self.float_dtype = float_dtype
+        self._set_mkl_num_threads()
+        self.manifest_paths = list(self._iterate_manifest_paths())
+        self.text_tokenizer = self._init_text_tokenizer()
+        self.unit_tokenizer = self._init_unit_tokenizer()
+        self.spm_encoder = SentencePieceEncoder(model=self.text_tokenizer.model, suffix_tokens=["</s>"])
+        self.text_prefix_tokens = self._build_text_tgt_prefixes()
+        self.unit_prefix_tokens = self._build_unit_tgt_prefixes()
+        if self.config.fixed_batch_size is None:
+            self.tgt_text_batch_shapes = self._calculate_tgt_text_batch_shapes()
+        else:
+            self.tgt_text_batch_shapes = []
+
+        self.pipeline = self._build_pipeline()
+
+    @classmethod
+    def _set_mkl_num_threads(cls):
+        """ Setting mkl num threads to 1, so that we don't get thread explosion."""
+        mkl_rt = ctypes.CDLL('libmkl_rt.so')
+        mkl_rt.mkl_set_num_threads(ctypes.byref(ctypes.c_int(1)))
+
+    def _calculate_tgt_text_batch_shapes(self) -> List[Tuple[int, int]]:
+        max_seq_len = self.config.max_tgt_text_tokens_per_sample
+        max_tokens_per_batch = self.config.max_tgt_text_tokens_per_batch
+        assert max_tokens_per_batch is not None, "max_tokens_per_batch is not set"
+        step = self.BATCH_WIDTH_STEP
+        bucket_sizes = []
+        for seq_len in range(step, max(step, max_seq_len) + 1, step):
+            bsz = max(1, max_tokens_per_batch // seq_len)
+            bucket_sizes.append((bsz, seq_len))
+        return bucket_sizes
+
+    def _build_text_tgt_prefixes(self) -> Dict[str, List[int]]:
+        return {
+            lang_tok: self.text_tokenizer.create_encoder(
+                lang=lang_tok, mode="target"
+            ).prefix_indices.tolist()  # type:ignore
+            for lang_tok in self.text_tokenizer.langs
+        }
+
+    def _build_unit_tgt_prefixes(self) -> Dict[str, List[int]]:
+        assert self.unit_tokenizer.vocab_info.eos_idx is not None
+        return {
+            lang_tok: [
+                self.unit_tokenizer.vocab_info.eos_idx,
+                self.unit_tokenizer.lang_to_index(lang_tok),
+            ]
+            for lang_tok in self.unit_tokenizer.langs
+        }  # type: ignore
+
+    def _init_text_tokenizer(self) -> Union[NllbTokenizer, SPMTokenizer]:
+        if self.config.text_tokenization.from_model is not None:
+            return load_unity_text_tokenizer(self.config.text_tokenization.from_model)
+        else:
+            assert self.config.text_tokenization.langtoks is not None
+            assert self.config.text_tokenization.spm_path is not None
+            return SPMTokenizer(
+                pathname=self.config.text_tokenization.spm_path, langs=self.config.text_tokenization.langtoks
+            )
+
+    def _init_unit_tokenizer(self) -> UnitTokenizer:
+        if self.config.unit_tokenization.from_model is not None:
+            return load_unity_unit_tokenizer(self.config.unit_tokenization.from_model)
+        else:
+            raise NotImplementedError("TBD")
+
+    def _load_manifest_list_from_file(self) -> Iterator[str]:
+        if self.config.manifest_list_path is not None:
+            for line in open(self.config.manifest_list_path).readlines():
+                line = line.split("#")[0].strip()  # allow comments
+                if line:
+                    yield line
+
+    def _load_raw_manifest_list(self) -> List[str]:
+        raw_list = []
+        if self.config.manifest_list is not None:
+            raw_list += self.config.manifest_list.strip().split(",")
+        raw_list += list(self._load_manifest_list_from_file())
+        return raw_list
+
+    def _infer_manifest_full_path(self, manifest_name: str) -> str:
+        full_path = manifest_name.strip()
+        if self.config.manifest_path_prefix is not None:
+            full_path = os.path.join(self.config.manifest_path_prefix.strip(), full_path)
+        if not full_path.endswith(self.MANIFEST_EXT) and not os.path.exists(full_path):
+            full_path += self.MANIFEST_EXT
+        if not os.path.exists(full_path):
+            raise FileNotFoundError(f"File not found {full_path}")
+        return full_path
+
+    def _iterate_manifest_paths(self, skip_missing_files: bool = True) -> Iterator[str]:
+        """Yields full paths to manifests described in the data config.
+        Check that each file exist.
+        Expects *.tsv files"""
+        raw_list = self._load_raw_manifest_list()
+        for manifest_name in raw_list:
+            try:
+                full_path = self._infer_manifest_full_path(manifest_name=manifest_name)
+            except FileNotFoundError:
+                if skip_missing_files:
+                    logger.warning(f"Skipping manifest {manifest_name}, file not found")
+                    continue
+                raise
+            yield full_path
+
+    def _read_column_names(self, manifest_path: str) -> List[str]:
+        """Gets the order of columns in the manifest file.
+        Also checks that expected columns are present."""
+        with open(manifest_path, "r") as in_fp:
+            column_names = in_fp.readline().strip().split("\t")
+        for column in [
+            self.AUDIO_COLUMN_NAME,
+            self.TARGET_TEXT_COLUMN,
+            self.TARGET_UNITS_COLUMN,
+            self.TARGET_LANG_COLUMN,
+        ]:
+            if column not in column_names:
+                raise ValueError(f"Column `{column}` is not present in `{manifest_path}` ")
+        return column_names
+
+    def _builder_from_manifest(self, manifest_path: str) -> DataPipelineBuilder:
+        """Creates a data pipeline builder for the specified manifest_path file."""
+        logger.debug(f"Initialiazing samples loader from {manifest_path}")
+
+        # Memory map file and read it in text mode (skip empty lines if any).
+        # Skip header.
+        tsv_lines = (
+            read_text(
+                pathname=manifest_path,
+                encoding="UTF-8",
+                rtrim=True,
+                skip_empty=True,
+                memory_map=True,
+            )
+            .skip(1)
+            .and_return()
+        )
+
+        # Assign column names:
+        # line content: `_`
+        # source manifest path: `manifest_path`
+        # line number: `lineno`
+        line_numbers = DataPipeline.count().and_return()
+        filename_const = DataPipeline.constant(manifest_path).and_return()
+        pipeline = DataPipeline.zip(
+            [tsv_lines, filename_const, line_numbers],
+            names=[self.ROOT_COLUMN, "manifest_path", "lineno"],
+            zip_to_shortest=True,
+        )
+
+        # Read every `world_size`th line starting from `rank`th item in the file.
+        pipeline.shard(self.rank, self.world_size)
+
+        if self.config.shuffle_window is not None:
+            pipeline.shuffle(self.config.shuffle_window)
+
+        # Split each text line into its fields.
+        fields = self._read_column_names(manifest_path)
+        logger.debug(f"Column names: {fields}")
+        txt_splitter = StrSplitter(sep=self.MANIFEST_COLUMN_SEP, names=fields, indices=[], exclude=True)
+        pipeline.map(
+            txt_splitter,
+            selector=self.ROOT_COLUMN,
+            num_parallel_calls=self.config.num_threads,
+        )
+        # And, create the pipeline for the TSV file.
+        return pipeline
+
+    def _get_manifest_funnel(self) -> DataPipelineBuilder:
+        """Creates a joined pipeline from all manifests.
+        Picks samples from per-manifest pipelines in a round-robin order"""
+        # TODO: add the ability to upsample/downsample manifests
+        logger.info(f"Aggregating data from {len(self.manifest_paths)} manifests")
+        builders = [self._builder_from_manifest(manifest_path=path) for path in self.manifest_paths]
+        pipelines = [builder.and_return() for builder in builders]
+        return DataPipeline.round_robin(pipelines=pipelines)
+
+    def _attach_audio(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
+        """Attaches audio waveforms and fbanks from linked autio files"""
+        audio_selector = f"{self.ROOT_COLUMN}.{self.AUDIO_COLUMN_NAME}"
+        audio_data_selector = f"{audio_selector}.data"
+
+        # Memory map each `audio_file`
+        map_file = FileMapper(self.config.audio.audio_root_dir, cached_fd_count=100)
+        builder.map(
+            map_file,
+            selector=audio_selector,
+            num_parallel_calls=self.config.num_threads,
+        )
+
+        # Decode each mmap'ed audio file using libsndfile.
+        decode_audio = AudioDecoder(dtype=torch.float32)
+        builder.map(
+            decode_audio,
+            selector=audio_data_selector,
+            num_parallel_calls=self.config.num_threads,
+        )
+
+        # And, convert from waveform to log-mel filterbank
+        convert_to_fbank = WaveformToFbankConverter(
+            num_mel_bins=self.config.audio.fbanks_num_mel_bins,
+            waveform_scale=self.config.audio.fbanks_waveform_scale,
+            channel_last=True,  # audio channel is the last dimension in the waveform
+            standardize=self.config.audio.fbanks_standardize_audio,
+            keep_waveform=False,
+            device=self.target_device,
+            dtype=self.float_dtype,
+        )
+        builder.map(
+            convert_to_fbank,
+            selector=audio_data_selector,
+            num_parallel_calls=self.config.num_threads,
+        )
+        return builder
+
+    def _attach_target_tokens(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
+        # Convert `raw_tgt_text` to (full) target tokenized sequences:
+        #                   <eos> <lang_tok> <tokens .. > <eos>
+        # Lang tokens change between rows, so can't use static encoder
+        builder.map(
+            [self.spm_encoder],
+            selector=f"{self.ROOT_COLUMN}.{self.TARGET_TEXT_COLUMN}",
+            num_parallel_calls=self.config.num_threads,
+        )
+
+        # Convert the `tgt_text` field into a unit tensor + EOS
+        # TODO: We should use unit tokenizer.
+        # Motivation for the current implementation:
+        # 1) lang_tok can change between rows.
+        #       If we want to attach lang_token_id here, we need a way to join values from two columns
+        # 2) StrToTensorConverter doesn't allow suffix tokens. Adding it later is less convenient.
+        # 3) Not a computational blocker
+        convert_to_units = lambda units_str: (  # noqa: E731
+            torch.LongTensor(
+                [int(unit_id) + 4 for unit_id in units_str.rstrip().bytes().decode("utf-8").split()]
+                + [self.unit_tokenizer.vocab_info.eos_idx]
+            )
+        )
+        builder.map(
+            [convert_to_units],
+            selector=f"{self.ROOT_COLUMN}.{self.TARGET_UNITS_COLUMN}",
+            num_parallel_calls=self.config.num_threads,
+        )
+
+        # prefixes for tokenized texts and speech units (<eos> <lang_tok>)
+        prefix_builder = lambda lang_tok: torch.LongTensor(  # noqa: E731
+            [
+                self.text_prefix_tokens[lang_tok.bytes().decode("utf8")],
+                self.unit_prefix_tokens[lang_tok.bytes().decode("utf8")],
+            ]
+        )
+        builder.map(
+            [prefix_builder],
+            selector=f"{self.ROOT_COLUMN}.{self.TARGET_LANG_COLUMN}",
+            num_parallel_calls=self.config.num_threads,
+        )
+        return builder
+
+    def _get_input_audio_seconds(self, sample: Any) -> float:
+        audio_data = sample[self.ROOT_COLUMN][self.AUDIO_COLUMN_NAME]["data"]
+        input_audio_sample_rate = audio_data["sample_rate"]
+        num_fbanks = max(audio_data["fbank"].shape)  # not guessing the dim order
+        # TODO: clarify where '* 2' comes from
+        waveform_length = num_fbanks * self.config.audio.fbanks_num_mel_bins * 2
+        input_audio_seconds = waveform_length / input_audio_sample_rate
+        return input_audio_seconds
+
+    def _is_long_sample(self, sample: Any) -> bool:
+        # input audio length
+        if self._get_input_audio_seconds(sample) > self.config.max_seconds_per_input_audio:
+            return True
+
+        # target text tokens
+        num_tgt_text_tokens = sample[self.ROOT_COLUMN][self.TARGET_TEXT_COLUMN].shape[-1]
+        if num_tgt_text_tokens > self.config.max_tgt_text_tokens_per_sample:
+            return True
+
+        # target units
+        num_tgt_units = sample[self.ROOT_COLUMN][self.TARGET_UNITS_COLUMN].shape[-1]  # target units
+        if num_tgt_units > self.config.max_units_per_sample:
+            return True
+        return False
+
+    def _filter_samples(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
+        # Drop long samples
+        builder.filter(lambda sample: not self._is_long_sample(sample))
+        return builder
+
+    def _batch_samples(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
+        if self.config.fixed_batch_size is not None:
+            builder.bucket(bucket_size=self.config.fixed_batch_size)
+        elif self.tgt_text_batch_shapes is not None:
+            builder.bucket_by_length(
+                self.tgt_text_batch_shapes,
+                selector=f"{self.ROOT_COLUMN}.{self.TARGET_TEXT_COLUMN}",
+            )
+        else:
+            raise ValueError("Unclear batching strategy")
+        # Collate bucketed elements into a batch.
+        collater = Collater(
+            pad_to_multiple=1,
+            overrides=[
+                CollateOptionsOverride(
+                    selector=f"{self.ROOT_COLUMN}.{self.AUDIO_COLUMN_NAME}.data.fbank",
+                    pad_idx=self.config.fbank_feats_pad_idx,
+                ),
+                CollateOptionsOverride(
+                    selector=f"{self.ROOT_COLUMN}.{self.TARGET_TEXT_COLUMN}",
+                    pad_idx=self.text_tokenizer.vocab_info.pad_idx,
+                ),
+                CollateOptionsOverride(
+                    selector=f"{self.ROOT_COLUMN}.{self.TARGET_UNITS_COLUMN}",
+                    pad_idx=self.unit_tokenizer.vocab_info.pad_idx,
+                ),
+            ],
+        )
+        builder.map(collater, num_parallel_calls=self.config.num_threads)
+        if self.config.prefech_batches is not None:
+            builder.prefetch(self.config.prefech_batches)
+        return builder
+
+    def _build_pipeline(self) -> DataPipeline:
+        data = self._get_manifest_funnel()
+        data = self._attach_audio(data)
+        data = self._attach_target_tokens(data)
+        data = self._filter_samples(data)
+        batches = self._batch_samples(data)
+        return batches.and_return()
+
+    def _gen_prev_toks_target_toks_target_lens(
+        self, seqs: Any, prefix_tokens: torch.Tensor, pad_idx: int, eos_idx: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # <eos> <lang_tok> ... <eos> <pad>*
+        tokens = torch.cat((prefix_tokens, seqs["seqs"]), 1)
+        target_lengths = seqs["seq_lens"] + 1  # + <leng_tok>
+
+        prev_output_tokens = torch.clone(tokens)
+        # replace last <eos> with <pad> and remove last column
+        mask = prev_output_tokens == eos_idx
+        mask[:, 0] = 0
+        prev_output_tokens[mask] = pad_idx
+        prev_output_tokens = prev_output_tokens[:, :-1]
+
+        target_tokens = tokens[:, 1:]
+        assert torch.equal(torch.count_nonzero(prev_output_tokens != pad_idx, dim=1), target_lengths)
+        assert torch.equal(torch.count_nonzero(target_tokens != pad_idx, dim=1), target_lengths)
+        return prev_output_tokens, target_tokens, target_lengths
+
+    def _get_text_to_units_batch(self, raw_batch: Any) -> SeqsBatch:
+        root = raw_batch[self.ROOT_COLUMN]
+        seqs = root[self.TARGET_UNITS_COLUMN]
+        prefix_tokens = root[self.TARGET_LANG_COLUMN][:, 1, :]
+        pad_idx = self.unit_tokenizer.vocab_info.pad_idx
+        eos_idx = self.unit_tokenizer.vocab_info.eos_idx
+        assert pad_idx is not None
+        assert eos_idx is not None
+
+        (
+            prev_output_tokens,
+            target_tokens,
+            target_lengths,
+        ) = self._gen_prev_toks_target_toks_target_lens(
+            seqs=seqs,
+            prefix_tokens=prefix_tokens,
+            pad_idx=pad_idx,
+            eos_idx=eos_idx,
+        )
+
+        return SeqsBatch(
+            src_tokens=None,
+            src_lengths=None,
+            target_tokens=target_tokens.to(self.target_device),
+            prev_output_tokens=prev_output_tokens.to(self.target_device),
+            target_lengths=target_lengths.to(self.target_device),
+            prefix_tokens=prefix_tokens.to(self.target_device),
+        )
+
+    def _get_speech_src_tokens_and_lengths(self, raw_batch: Any) -> Tuple[torch.Tensor, torch.Tensor]:
+        fbanks = raw_batch[self.ROOT_COLUMN][self.AUDIO_COLUMN_NAME]["data"]["fbank"]
+        return fbanks["seqs"].to(self.float_dtype), fbanks["seq_lens"]
+
+    def _get_speech_to_text_batch(self, raw_batch: Any) -> SeqsBatch:
+        root = raw_batch[self.ROOT_COLUMN]
+        seqs = root[self.TARGET_TEXT_COLUMN]
+        prefix_tokens = root[self.TARGET_LANG_COLUMN][:, 0, :]
+        pad_idx = self.text_tokenizer.vocab_info.pad_idx
+        assert pad_idx is not None
+        eos_idx = self.text_tokenizer.vocab_info.eos_idx
+        assert eos_idx is not None
+
+        (
+            prev_output_tokens,
+            target_tokens,
+            target_lengths,
+        ) = self._gen_prev_toks_target_toks_target_lens(
+            seqs=seqs,
+            prefix_tokens=prefix_tokens,
+            pad_idx=pad_idx,
+            eos_idx=eos_idx,
+        )
+        src_tokens, src_lengths = self._get_speech_src_tokens_and_lengths(raw_batch=raw_batch)
+
+        return SeqsBatch(
+            src_tokens=src_tokens.to(self.target_device),
+            src_lengths=src_lengths.to(self.target_device),
+            target_tokens=target_tokens.to(self.target_device),
+            prev_output_tokens=prev_output_tokens.to(self.target_device),
+            target_lengths=target_lengths.to(self.target_device),
+            prefix_tokens=prefix_tokens.to(self.target_device),
+        )
+
+    def _convert_to_multimodal_seqs_batch(self, raw_batch: Any) -> MultimodalSeqsBatch:
+        return MultimodalSeqsBatch(
+            speech_to_text=self._get_speech_to_text_batch(raw_batch=raw_batch),
+            text_to_units=self._get_text_to_units_batch(raw_batch=raw_batch),
+        )
+
+    def iterate_batches(self) -> Iterator[MultimodalSeqsBatch]:
+        for raw_batch in self.pipeline:
+            yield self._convert_to_multimodal_seqs_batch(raw_batch)
+
+    def reset(self) -> None:
+        self.pipeline.reset()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format=f"%(asctime)s %(levelname)s -- %(name)s.{os.getpid()}: %(message)s",
+    )
+    config = DataLoadingConfig(
+        audio=AudioProcessingConfig(
+            audio_root_dir="/fsx-ust/data/audio_zips/",
+        ),
+        manifest_path_prefix="/fsx-ust/spopuri/datasets/S2ST/V1/M4T_V1_phase2/primary",
+        manifest_list_path="/data/home/mavlyutov/train_manifests.txt",
+        shuffle_window=1000,
+        num_threads=5,
+    )
+    loader = UnityDataLoader(config=config, target_device=torch.device("cpu"))
+    for idx, batch in enumerate(loader.iterate_batches()):
+        if idx % 10 == 0:
+            assert batch.speech_to_text.src_tokens is not None
+            print(batch.speech_to_text.src_tokens.shape)
+            logger.info(f".. pulled {idx} batches")
+            if idx > 1000:
+                break
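
For reference, a standalone sketch of the prev/target construction done in _gen_prev_toks_target_toks_target_lens above, using made-up token indices (eos=3, pad=0 and a hypothetical lang token id); the spm-encoded target already ends with <eos>, and the prefix contributes <eos> <lang_tok>:

    import torch

    eos, pad, lang = 3, 0, 256001
    prefix_tokens = torch.LongTensor([[eos, lang]])      # <eos> <lang_tok>
    seqs = torch.LongTensor([[11, 12, 13, eos]])         # tokens ... <eos>
    seq_lens = torch.LongTensor([4])

    tokens = torch.cat((prefix_tokens, seqs), dim=1)     # <eos> <lang> 11 12 13 <eos>
    target_lengths = seq_lens + 1                        # +1 for <lang_tok>

    prev_output_tokens = tokens.clone()
    mask = prev_output_tokens == eos
    mask[:, 0] = False                                   # keep the leading <eos>
    prev_output_tokens[mask] = pad                       # trailing <eos> -> <pad>
    prev_output_tokens = prev_output_tokens[:, :-1]      # drop the last column

    target_tokens = tokens[:, 1:]                        # shifted left by one

    print(prev_output_tokens.tolist())  # [[3, 256001, 11, 12, 13]]
    print(target_tokens.tolist())       # [[256001, 11, 12, 13, 3]]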

+ 76 - 0
scripts/m4t/train/dist_utils.py

@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+import os
+from datetime import timedelta
+from typing import List
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing
+
+logger = logging.getLogger(__name__)
+
+
+def is_dist_initialized() -> bool:
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+
+
+def get_rank() -> int:
+    if not is_dist_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def get_local_rank() -> int:
+    if not is_dist_initialized():
+        return 0
+    return int(os.environ["LOCAL_RANK"])
+
+
+def get_world_size() -> int:
+    if not is_dist_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def is_main_process() -> bool:
+    return get_rank() == 0
+
+
+def init_distributed(loggers: List[logging.Logger]) -> None:
+    """Initializes the distributed backend"""
+    torch.multiprocessing.set_start_method("spawn")
+    if "RANK" not in os.environ:
+        logger.error(
+            "Cannot init disributed context, as environment varaibles are not set."
+        )
+        return
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    logger.info(
+        f"Rank={rank} local rank={local_rank}, world_size={world_size}, is_master={rank == 0}"
+    )
+    dist.init_process_group(
+        backend="nccl",
+        init_method="env://",
+        world_size=world_size,
+        rank=rank,
+        timeout=timedelta(seconds=180),
+    )
+    logger.info(f"Setting cuda:{local_rank} as main device")
+    if not is_main_process():
+        for to_mute in loggers:
+            to_mute.setLevel(logging.ERROR)
+    torch.cuda.set_device(local_rank)
+    dist.barrier()
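
For reference, a minimal sketch of how dist_utils is meant to be driven from a training entry point, assuming the process is launched with torchrun (which exports the RANK, WORLD_SIZE and LOCAL_RANK variables that init_distributed reads) and that the package is importable as m4t_scripts.train like the other scripts:

    # launched e.g. with: torchrun --nproc_per_node=8 train.py
    import logging

    from m4t_scripts.train import dist_utils

    logger = logging.getLogger("train")


    def main() -> None:
        # sets up the NCCL process group and demotes logging on non-main ranks
        dist_utils.init_distributed([logger])
        logger.info(
            f"rank={dist_utils.get_rank()} / world_size={dist_utils.get_world_size()}, "
            f"main={dist_utils.is_main_process()}"
        )


    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        main()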

+ 79 - 0
scripts/m4t/train/install_devfair.sh

@@ -0,0 +1,79 @@
+
+#  This script installs seamless_communication (internal) + fairseq2 on an AWS cluster.
+
+set -e
+set -x
+
+echo "Installing Conda"
+export TGT=`echo ~/seacom`
+rm -rf $TGT
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -qO /tmp/conda.sh
+bash /tmp/conda.sh -bp $TGT
+export CONDA=$TGT/bin/conda
+export CONDA_ACTIVATE=$TGT/bin/activate
+export ENV_N=sc_fr2
+echo "Next step will take ~15 minutes. Get some coffee" 
+module add cuda/11.8
+$CONDA create -y -n ${ENV_N} python=3.10 pytorch=2.0.1 pytorch-cuda=11.8 torchvision torchaudio \
+             compilers libsndfile==1.0.31 gcc==11.4.0 \
+    --strict-channel-priority --override-channels \
+    -c pytorch \
+    -c nvidia \
+    -c conda-forge
+
+echo "Setting LD_LIBRARY_PATH"
+. $CONDA_ACTIVATE activate ${ENV_N}
+if [ -z "$CONDA_PREFIX" ]; then 
+  echo "CONDA_PREFIX env var is not set!" 
+  exit 1
+else 
+   path=$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
+   echo "export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" >> ${path}
+fi
+. $CONDA_ACTIVATE activate ${ENV_N}  # update env vars
+
+#  Installing fairseq2.
+echo "Installing fairseq2"
+if [[ "${I_DONT_PLAN_TO_HACK_FAIRSEQ2:-No}" == "Yes" ]] ; then
+pip install fairseq2 \
+  --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.0.1/cu118
+else
+#  NOTICE: to compile CUDA kernels, you need NVCC. On AWS cluster an easy way would be to get a GPU container:
+#  srun -N 1 --gres=gpu:1 --cpus-per-task=20 --partition seamless --time 2400 --pty /bin/bash -l
+cd $TGT
+git clone --recurse-submodules  git@github.com:facebookresearch/fairseq2.git
+pip install -r fairseq2/fairseq2n/python/requirements-build.txt
+cd fairseq2
+pip install -e .  # it will install public fairseq2n, we rewrite it below
+cd fairseq2n
+args="-GNinja\
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CUDA_ARCHITECTURES=80-real;80-virtual\
+  -DFAIRSEQ2N_INSTALL_STANDALONE=ON\
+  -DFAIRSEQ2N_PERFORM_LTO=ON\
+  -DFAIRSEQ2N_TREAT_WARNINGS_AS_ERRORS=OFF\
+  -DFAIRSEQ2N_USE_CUDA=ON\
+  -DFAIRSEQ2N_BUILD_PYTHON_BINDINGS=ON\
+  -DFAIRSEQ2N_PYTHON_DEVEL=OFF"
+cmake ${args} -B build
+cmake --build build
+cd python && pip install .
+fi
+# Quick test
+python -c "from fairseq2n.bindings.data.string import CString as CString"
+
+# Has to go before fairseq2 to make sure that it will not reinstall fairseq2n
+echo "Installing seamless_communication"
+cd $TGT
+git clone git@github.com:fairinternal/seamless_communication.git
+cd seamless_communication
+pip install -e .   # editable mode for hacking
+
+echo "One more time re-install fairseq2n (most propably overriden by seamless_communication)"
+cd $TGT/fairseq2/fairseq2n/python
+pip install .
+
+
+echo "Finished."
+echo "To activate the environment run: . $CONDA_ACTIVATE activate ${ENV_N}"
+echo "Location of seamless_communication checkout: $TGT/seamless_communication"

+ 90 - 0
scripts/m4t/train/install_fairaws.sh

@@ -0,0 +1,90 @@
+
+#  This script installs seamless_communication (internal) + fairseq2 on an AWS cluster.
+
+set -e
+set -x
+
+echo "Installing Conda"
+export TGT=`echo ~/seacom_aws_dev`
+rm -rf $TGT
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -qO /tmp/conda.sh
+bash /tmp/conda.sh -bp $TGT
+export CONDA=$TGT/bin/conda
+export CONDA_ACTIVATE=$TGT/bin/activate
+export ENV_N=sc_fr2_dev
+echo "Next step will take ~15 minutes. Get some coffee" 
+$CONDA create -y -n ${ENV_N} python=3.10 pytorch=2.0.1 pytorch-cuda=11.8 torchvision torchaudio \
+             compilers libsndfile==1.0.31 gcc==11.4.0 \
+    --strict-channel-priority --override-channels \
+    -c https://aws-ml-conda.s3.us-west-2.amazonaws.com \
+    -c pytorch \
+    -c nvidia \
+    -c conda-forge
+
+echo "Setting LD_LIBRARY_PATH"
+. $CONDA_ACTIVATE activate ${ENV_N}
+if [ -z "$CONDA_PREFIX" ]; then 
+  echo "CONDA_PREFIX env var is not set!" 
+  exit 1
+else 
+   path=$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
+   echo "export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" >> ${path}
+fi
+. $CONDA_ACTIVATE activate ${ENV_N}  # update env vars
+
+
+#  NOTICE: to compile CUDA kernels, you need NVCC. On AWS cluster an easy way would be to get a GPU container:
+#  srun -N 1 --gres=gpu:1 --cpus-per-task=20 --partition seamless --time 2400 --pty /bin/bash -l
+
+#  Installing fairseq2.
+echo "Installing fairseq2"
+set -e
+rm -rf fairseq2  # wipe existing clones
+if [[ "${I_DONT_PLAN_TO_HACK_FAIRSEQ2:-No}" == "Yes" ]] ; then
+pip install fairseq2 \
+  --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.0.1/cu118
+else
+nvidia-smi || echo "to compile CUDA kernels, you need NVCC.\n \
+   On AWS cluster an easy way would be to get a GPU container.\n \
+   Run smth like 'srun -N 1 --gres=gpu:1 --cpus-per-task=20 --partition seamless --time 2400 --pty /bin/bash -l' \n \
+   and continue from "Installing fairseq2" line. \
+   Terminating for now."
+nvidia-smi || exit 1
+cd $TGT
+. $CONDA_ACTIVATE activate ${ENV_N}
+git clone --recurse-submodules  git@github.com:facebookresearch/fairseq2.git
+pip install -r fairseq2/fairseq2n/python/requirements-build.txt
+cd fairseq2
+pip install -e .  # it will install public fairseq2n, we rewrite it below
+cd fairseq2n
+args="-GNinja\
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CUDA_ARCHITECTURES=80-real;80-virtual\
+  -DFAIRSEQ2N_INSTALL_STANDALONE=ON\
+  -DFAIRSEQ2N_PERFORM_LTO=ON\
+  -DFAIRSEQ2N_TREAT_WARNINGS_AS_ERRORS=OFF\
+  -DFAIRSEQ2N_USE_CUDA=ON\
+  -DFAIRSEQ2N_BUILD_PYTHON_BINDINGS=ON\
+  -DFAIRSEQ2N_PYTHON_DEVEL=OFF"
+cmake ${args} -B build
+cmake --build build
+cd python && pip install .
+fi
+# Quick test
+python -c "from fairseq2n.bindings.data.string import CString as CString"
+
+echo "Installing seamless_communication"
+cd $TGT
+git clone git@github.com:fairinternal/seamless_communication.git
+cd seamless_communication
+pip install -e .   # editable mode for hacking
+
+
+echo "One more time re-install fairseq2n (most propably overriden by seamless_communication)"
+cd $TGT/fairseq2/fairseq2n/python
+pip install .
+
+
+echo "Finished."
+echo "To activate the environment run: . $CONDA_ACTIVATE activate ${ENV_N}"
+echo "Location of seamless_communication checkout: $TGT/seamless_communication"

+ 258 - 0
scripts/m4t/train/model.py

@@ -0,0 +1,258 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+import os
+from typing import Dict, Any
+
+import torch
+from m4t_scripts.train.configs import CustomModelParams, ModelConfig
+
+from seamless_communication.models.unity import (
+    UnitYConfig,
+    UnitYModel,
+    load_unity_model,
+    create_unity_model,
+)
+from seamless_communication.models.unity.loader import load_unity_config
+from seamless_communication.models.unity import UnitYT2UConfig
+from fairseq2.nn.transformer import TransformerNormOrder
+from fairseq2.models.wav2vec2 import Wav2Vec2EncoderConfig
+from fairseq2.models.nllb.builder import NllbConfig
+from fairseq2.models.utils.checkpoint_loader import convert_model_state_dict
+from fairseq2.models.wav2vec2.loader import Wav2Vec2Loader
+from seamless_communication.models.unity.loader import UnitYLoader
+
+from fairseq2.models.nllb.loader import NllbLoader
+
+logger = logging.getLogger(__name__)
+
+
+CPU_DEVICE = torch.device("cpu")
+
+
+class ModelBuilder:
+    def __init__(
+        self,
+        config: ModelConfig,
+        dtype: torch.dtype = torch.float16,
+        device: torch.device = CPU_DEVICE,
+    ):
+        self.config = config
+        self.dtype = dtype
+        self.device = device
+
+    @classmethod
+    def _sel_and_upd_prefix(cls, kv: Dict[str, Any], prefix: str, new_prefix: str = "") -> Dict[str, Any]:
+        # fmt: off
+        return {new_prefix + k[len(prefix):]: v for k, v in kv.items() if k.startswith(prefix)}
+        # fmt: on
+
+    @classmethod
+    def _load_pretrained_w2v2_encoder(cls, model: UnitYModel, checkpoint_path: str) -> None:
+        """Load w2v2 encoder model trained in fairseq1"""
+        logger.info(f"Loading w2v2 weights from {checkpoint_path}")
+        state_dict = torch.load(checkpoint_path)["model"]
+        key_map = Wav2Vec2Loader._fairseq_key_map()
+        key_map.update(
+            {
+                r"^encoder.layers\.([0-9]+)\.conv_module.batch_norm.": r"encoder.layers.\1.conv.batch_norm.",
+                r"^encoder.layers\.([0-9]+)\.conv_module.depthwise_conv.": r"encoder.layers.\1.conv.depthwise_conv.",
+                r"^encoder.layers\.([0-9]+)\.conv_module.pointwise_conv([0-9]+)\.": (
+                    r"encoder.layers.\1.conv.pointwise_conv\2."
+                ),
+                r"^encoder.layers\.([0-9]+)\.conv_module.layer_norm.": r"encoder.layers.\1.conv_layer_norm.",
+                r"^encoder.layers\.([0-9]+)\.ffn([0-9]+)\.layer_norm.": r"encoder.layers.\1.ffn\2_layer_norm.",
+                r"^encoder.layers\.([0-9]+)\.ffn([0-9]+)\.w_1.": r"encoder.layers.\1.ffn\2.inner_proj.",
+                r"^encoder.layers\.([0-9]+)\.ffn([0-9]+)\.w_2.": r"encoder.layers.\1.ffn\2.output_proj.",
+                r"^encoder.layers\.([0-9]+)\.self_attn.linear_k\.": r"encoder.layers.\1.self_attn.k_proj.",
+                r"^encoder.layers\.([0-9]+)\.self_attn.linear_q\.": r"encoder.layers.\1.self_attn.q_proj.",
+                r"^encoder.layers\.([0-9]+)\.self_attn.linear_v\.": r"encoder.layers.\1.self_attn.v_proj.",
+                r"^encoder.layers\.([0-9]+)\.self_attn.linear_out\.": r"encoder.layers.\1.self_attn.output_proj.",
+                r"^encoder.layers\.([0-9]+)\.self_attn.linear_pos.weight": (
+                    r"encoder.layers.\1.self_attn.sdpa.r_proj.weight"
+                ),
+                r"^encoder.layers\.([0-9]+)\.self_attn.pos_bias_u": r"encoder.layers.\1.self_attn.sdpa.u_bias",
+                r"^encoder.layers\.([0-9]+)\.self_attn.pos_bias_v": r"encoder.layers.\1.self_attn.sdpa.v_bias",
+                # overrides existing rule
+                r"^encoder\.layers\.([0-9]+)\.final_layer_norm\.": r"encoder.layers.\1.layer_norm.",
+            }
+        )
+        state_dict = convert_model_state_dict(state_dict=state_dict, key_map=key_map)
+        # w2v2_encoder in fairseq2 has the encoder layer_norm set to None
+        for rm_key in ["encoder.layer_norm.bias", "encoder.layer_norm.weight"]:
+            del state_dict[rm_key]
+        enc_state_dict = cls._sel_and_upd_prefix(kv=state_dict, prefix="encoder.")
+        model.speech_encoder.inner.load_state_dict(enc_state_dict, strict=True)  # type: ignore
+        logger.info(f"Loaded w2v2 encoder from {checkpoint_path}")
+
+        enc_frontend_state_dict = cls._sel_and_upd_prefix(kv=state_dict, prefix="encoder_frontend.")  # noqa
+        # TODO: reconcile discrepancies between fr1 and fr2 model designs
+        #  fr1-based w2v2 checkpoints with conv positional encoders use relpos self attention
+        #   this is not compatible with the fr2 model design
+        # model.speech_encoder_frontend.load_state_dict(enc_frontend_state_dict)
+        # logger.info(f"Loaded w2v2 encoder frontend from {checkpoint_path}")
+
+    @classmethod
+    def _load_pretrained_s2t_decoder(cls, model: UnitYModel, checkpoint_path: str) -> None:
+        """Load NLLB decoder trained in fairseq1"""
+        logger.info(f"Loading s2t decoder weights from {checkpoint_path}")
+        try:
+            state_dict = torch.load(checkpoint_path)["model"]
+        except ModuleNotFoundError:
+            logger.info("If seeing `No module named 'omegaconf'`, run `pip install omegaconf`")
+            raise
+        decoder_prefix = "decoder."
+        shared_state_dict = cls._sel_and_upd_prefix(kv=state_dict, prefix="shared_decoder.", new_prefix=decoder_prefix)
+        shared_state_dict = convert_model_state_dict(
+            state_dict=shared_state_dict, key_map=NllbLoader._fairseq_key_map()
+        )
+        for rm_key in ["decoder.embed_positions._float_tensor", "decoder.version"]:
+            del shared_state_dict[rm_key]
+        decoder_state = cls._sel_and_upd_prefix(kv=shared_state_dict, prefix=decoder_prefix, new_prefix="")
+        frontend_state = cls._sel_and_upd_prefix(kv=shared_state_dict, prefix="decoder_frontend.", new_prefix="")
+        proj_state = cls._sel_and_upd_prefix(kv=shared_state_dict, prefix="final_proj.", new_prefix="")
+        model.text_decoder_frontend.load_state_dict(frontend_state, strict=True)
+        logger.info(f"Loaded s2t decoder frontend weights from {checkpoint_path}")
+        model.text_decoder.load_state_dict(decoder_state, strict=True)
+        logger.info(f"Loaded s2t decoder weights from {checkpoint_path}")
+        model.final_proj.load_state_dict(proj_state, strict=True)
+        logger.info(f"Loaded s2t decoder final_proj weights from {checkpoint_path}")
+
+    @classmethod
+    def _load_pretrained_t2u(cls, model: UnitYModel, model_config: UnitYConfig, checkpoint_path: str) -> None:
+        logger.info(f"Loading t2u weights from {checkpoint_path}")
+        t2u_model = model.t2u_model
+        assert t2u_model is not None
+        try:
+            state_dict = torch.load(checkpoint_path)["model"]
+        except ModuleNotFoundError:
+            logger.info("If seeing `No module named 'omegaconf'`, run `pip install omegaconf`")
+            raise
+        state_dict = {k.replace("encoder.", "synthesizer_encoder."): v for k, v in state_dict.items()}
+        state_dict = convert_model_state_dict(
+            state_dict=state_dict, key_map=UnitYLoader._fairseq_key_map(config=model_config)
+        )
+        t2u_state_dict = cls._sel_and_upd_prefix(kv=state_dict, prefix="t2u_model.", new_prefix="")
+        t2u_model.load_state_dict(t2u_state_dict)
+        logger.info(f"Loaded t2u weights from {checkpoint_path}")
+
+    def build_model(
+        self,
+    ) -> UnitYModel:
+        config = self.config
+        logger.info("Initializing model")
+        if config.from_model is not None:
+            logger.info(f"Loading model and weights from `{config.from_model}`")
+            return load_unity_model(config.from_model, device=self.device, dtype=self.dtype)
+
+        if config.from_model_config is not None:
+            logger.info(f"Loading Unity config from `{config.from_model_config}`")
+            model_config = load_unity_config(config.from_model_config)
+        elif config.custom_params is not None:
+            logger.info("Creating custom Unity config")
+            model_config = self._build_custom_model_config()
+        else:
+            raise ValueError("One of params from_model, from_model_config or custom_params has to be set")
+        logger.info("Building model")
+        model = create_unity_model(config=model_config, dtype=self.dtype, device=self.device)
+
+        if self.config.pretrained_w2v2_path is not None:
+            self._load_pretrained_w2v2_encoder(model, self.config.pretrained_w2v2_path)
+
+        if self.config.pretrained_s2t_decoder_path is not None:
+            self._load_pretrained_s2t_decoder(model, self.config.pretrained_s2t_decoder_path)
+
+        if self.config.pretrained_t2u_path is not None:
+            self._load_pretrained_t2u(model, model_config, self.config.pretrained_t2u_path)
+
+        return model
+
+    def _build_custom_model_config(self) -> UnitYConfig:
+        config = self.config.custom_params
+        assert config is not None
+        return UnitYConfig(
+            model_dim=config.model_embed_dim,
+            w2v2_encoder_config=Wav2Vec2EncoderConfig(
+                model_dim=config.model_embed_dim,
+                max_seq_len=4096,
+                feature_dim=160,
+                use_fbank=True,
+                first_pass_dropout_p=0.0,
+                layer_norm_features=config.w2v2_encoder_layers_layernorm_features,
+                feature_extractor_layer_descs=[],
+                feature_extractor_bias=False,
+                feature_extractor_layer_norm_convs=False,
+                feature_grad_scale=0,
+                num_fbank_channels=80,
+                fbank_stride=2,
+                sample_fbank_every_k=1,
+                pos_encoder_type=config.w2v2_pos_encoder_type,
+                pos_encoder_depth=config.w2v2_pos_encoder_depth,
+                pos_conv_kernel_size=config.w2v2_pos_conv_kernel_size,
+                num_pos_conv_groups=config.w2v2_num_pos_conv_groups,
+                use_conformer=config.w2v2_encoder_layers_use_conformer,
+                num_encoder_layers=config.w2v2_encoder_layers,
+                num_encoder_attn_heads=16,
+                ffn_inner_dim=config.model_embed_dim * 4,
+                dropout_p=0.0,
+                attn_dropout_p=0.0,
+                layer_drop_p=0.0,
+                norm_order=TransformerNormOrder.POST,
+                depthwise_conv_kernel_size=31,
+            ),
+            mt_model_config=NllbConfig(
+                model_dim=config.model_embed_dim,
+                max_seq_len=1024,
+                vocabulary_size=config.nllb_vocabulary_size,  # num_tokens + langs + spec symbols
+                pad_idx=0,
+                num_encoder_layers=config.nllb_encoder_layers,
+                num_decoder_layers=config.nllb_decoder_layers,
+                num_encoder_attn_heads=16,
+                num_decoder_attn_heads=16,
+                ffn_inner_dim=config.model_embed_dim * 8,
+                dropout_p=0.1,
+            ),
+            t2u_config=UnitYT2UConfig(
+                model_dim=config.model_embed_dim,
+                unit_max_seq_len=2048,
+                unit_vocabulary_size=config.unit_vocabulary_size,
+                unit_pad_idx=1,
+                num_encoder_layers=config.t2u_encoder_layers,
+                num_decoder_layers=config.t2u_decoder_layers,
+                nar_decoder_frontend_config=None,
+                nar_decoder_config=None,
+                num_encoder_attn_heads=16,
+                num_decoder_attn_heads=16,
+                ffn_inner_dim=config.model_embed_dim * 8,
+                dropout_p=0.1,
+            ),
+            use_text_encoder=True,
+            use_conformer_adaptor=False,
+            num_adaptor_layers=1,
+            adaptor_kernel_size=8,
+            adaptor_stride=8,
+            adaptor_layer_norm=True,
+            adaptor_dropout_p=0.1,
+        )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format=f"%(asctime)s %(levelname)s -- %(name)s.{os.getpid()}: %(message)s",
+    )
+    config = ModelConfig(
+        custom_params=CustomModelParams(
+            nllb_vocabulary_size=256103,
+        ),
+        pretrained_w2v2_path="/fsx-ust/spopuri/datasets/PT_CKPT/w2v2/w2vbert2rpq_600m_al5.pt",
+        pretrained_s2t_decoder_path="/fsx-ust/spopuri/datasets/PT_CKPT/S2T/S2T_M4T_V1_V1_cleaned.pt",
+        pretrained_t2u_path="/fsx-ust/spopuri/datasets/PT_CKPT/T2U/V5_10K_p2_14_80K.pt",
+    )
+    builder = ModelBuilder(config=config)
+    model = builder.build_model()
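
For reference, a tiny illustration of what ModelBuilder._sel_and_upd_prefix does when carving sub-module state dicts out of a flat fairseq1 checkpoint; the keys and values below are made up, and the import assumes the m4t_scripts.train package path used by the other scripts:

    from m4t_scripts.train.model import ModelBuilder

    state_dict = {
        "encoder.layers.0.self_attn.q_proj.weight": "tensor-a",
        "encoder.layers.0.self_attn.k_proj.weight": "tensor-b",
        "decoder.embed_tokens.weight": "tensor-c",
    }

    # keep only "encoder." keys and strip the prefix
    enc_only = ModelBuilder._sel_and_upd_prefix(kv=state_dict, prefix="encoder.")
    # {'layers.0.self_attn.q_proj.weight': 'tensor-a',
    #  'layers.0.self_attn.k_proj.weight': 'tensor-b'}

    # keep only "decoder." keys and swap in a new prefix
    renamed = ModelBuilder._sel_and_upd_prefix(kv=state_dict, prefix="decoder.", new_prefix="text_decoder.")
    # {'text_decoder.embed_tokens.weight': 'tensor-c'}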

+ 97 - 0
scripts/m4t/train/recipes/asr_small.yaml

@@ -0,0 +1,97 @@
+eval_data:
+  audio:
+    audio_root_dir: /fsx-ust/data/audio_zips/
+    fbanks_num_mel_bins: 80
+    fbanks_standardize_audio: true
+    fbanks_waveform_scale: 32768
+  fbank_feats_pad_idx: 0
+  manifest_list: dev_asr_only_aggregated_adapted
+  manifest_list_path: null
+  manifest_path_prefix: /data/home/mavlyutov/s2t_ondevice/
+  max_seconds_per_input_audio: 15
+  fixed_batch_size: 40
+  max_tgt_text_tokens_per_batch: 1000
+  max_tgt_text_tokens_per_sample: 300
+  max_units_per_sample: 1500
+  num_threads: 5
+  prefech_batches: null 
+  prepend_tgt_lang_tag: true
+  shuffle_window: 1000
+  text_tokenization:
+    from_model: null
+    langtoks:
+    - eng
+    - rus
+    - hin
+    - por
+    - spa
+    spm_path: /data/home/mavlyutov/s2t_ondevice/vocab20k/5_5_20k.model
+  unit_tokenization:
+    from_model: seamlessM4T_large
+    langtoks: null
+    num_units: null
+  unit_tokenizer_name: seamlessM4T_large
+model:
+  custom_params:
+    model_embed_dim: 768
+    nllb_decoder_layers: 3
+    nllb_encoder_layers: 1
+    nllb_vocabulary_size: 256102
+    t2u_decoder_layers: 1
+    t2u_encoder_layers: 1
+    unit_vocabulary_size: 10082
+    w2v2_encoder_layers: 6
+    w2v2_encoder_layers_layernorm_features: false
+    w2v2_encoder_layers_use_conformer: true
+    w2v2_num_pos_conv_groups: 0
+    w2v2_pos_conv_kernel_size: 0
+    w2v2_pos_encoder_depth: 0
+    w2v2_pos_encoder_type: relative
+  from_model: null
+  from_model_config: null
+  pretrained_s2t_decoder_path: null
+  pretrained_t2u_path: null
+  pretrained_w2v2_path: null
+train_data:
+  audio:
+    audio_root_dir: /fsx-ust/data/audio_zips/
+    fbanks_num_mel_bins: 80
+    fbanks_standardize_audio: true
+    fbanks_waveform_scale: 32768
+  fbank_feats_pad_idx: 0
+  manifest_list: train_asr_only_aggregated_5_dial_filtered_adapted
+  manifest_list_path: null
+  manifest_path_prefix: /data/home/mavlyutov/s2t_ondevice/
+  max_seconds_per_input_audio: 15
+  fixed_batch_size: 40
+  max_tgt_text_tokens_per_batch: 600
+  max_tgt_text_tokens_per_sample: 300
+  max_units_per_sample: 1500
+  num_threads: 4 
+  prefech_batches: null 
+  prepend_tgt_lang_tag: true
+  shuffle_window: 1000 
+  text_tokenization:
+    from_model: null
+    langtoks:
+    - eng
+    - rus
+    - hin
+    - por
+    - spa
+    spm_path: /data/home/mavlyutov/s2t_ondevice/vocab20k/5_5_20k.model
+  unit_tokenization:
+    from_model: seamlessM4T_large
+    langtoks: null
+    num_units: null
+  unit_tokenizer_name: seamlessM4T_large
+training:
+  eval_steps: 5000 
+  float_dtype: fp32
+  label_smoothing: 0.2
+  learning_rate: 0.0001
+  log_steps:  200 
+  max_epochs: 100
+  patience: 10
+  start_learning_rate: 1.0e-07
+  warmup_steps: 1000
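
For reference, a sketch of how a recipe like this one maps onto the dataclasses in configs.py, assuming PyYAML and the m4t_scripts.train import path used by the scripts above; the exact wiring in the training entry point may differ:

    import torch
    import yaml

    from m4t_scripts.train.configs import WorkflowParams
    from m4t_scripts.train.dataloader import UnityDataLoader
    from m4t_scripts.train.model import ModelBuilder

    with open("scripts/m4t/train/recipes/asr_small.yaml") as fp:
        params = WorkflowParams.deserialize(yaml.safe_load(fp))

    # float_dtype: fp32 in the recipe maps to torch.float32 here
    dtype = torch.float32
    model = ModelBuilder(config=params.model, dtype=dtype).build_model()
    train_loader = UnityDataLoader(config=params.train_data, float_dtype=dtype)
    eval_loader = UnityDataLoader(config=params.eval_data, float_dtype=dtype)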

+ 97 - 0
scripts/m4t/train/recipes/asr_small_wh_transc.yaml

@@ -0,0 +1,97 @@
+eval_data:
+  audio:
+    audio_root_dir: /fsx-ust/data/audio_zips/
+    fbanks_num_mel_bins: 80
+    fbanks_standardize_audio: true
+    fbanks_waveform_scale: 32768
+  fbank_feats_pad_idx: 0
+  manifest_list: dev_asr_only_aggregated_adapted
+  manifest_list_path: null
+  manifest_path_prefix: /data/home/mavlyutov/s2t_ondevice/
+  max_seconds_per_input_audio: 15
+  fixed_batch_size: 40
+  max_tgt_text_tokens_per_batch: 1000
+  max_tgt_text_tokens_per_sample: 300
+  max_units_per_sample: 1500
+  num_threads: 5
+  prefech_batches: null 
+  prepend_tgt_lang_tag: true
+  shuffle_window: 1000
+  text_tokenization:
+    from_model: null
+    langtoks:
+    - eng
+    - rus
+    - hin
+    - por
+    - spa
+    spm_path: /data/home/mavlyutov/s2t_ondevice/vocab20k/5_5_20k.model
+  unit_tokenization:
+    from_model: seamlessM4T_large
+    langtoks: null
+    num_units: null
+  unit_tokenizer_name: seamlessM4T_large
+model:
+  custom_params:
+    model_embed_dim: 768
+    nllb_decoder_layers: 3
+    nllb_encoder_layers: 1
+    nllb_vocabulary_size: 256102
+    t2u_decoder_layers: 1
+    t2u_encoder_layers: 1
+    unit_vocabulary_size: 10082
+    w2v2_encoder_layers: 6
+    w2v2_encoder_layers_layernorm_features: false
+    w2v2_encoder_layers_use_conformer: true
+    w2v2_num_pos_conv_groups: 0
+    w2v2_pos_conv_kernel_size: 0
+    w2v2_pos_encoder_depth: 0
+    w2v2_pos_encoder_type: relative
+  from_model: null
+  from_model_config: null
+  pretrained_s2t_decoder_path: null
+  pretrained_t2u_path: null
+  pretrained_w2v2_path: null
+train_data:
+  audio:
+    audio_root_dir: /fsx-ust/data/audio_zips/
+    fbanks_num_mel_bins: 80
+    fbanks_standardize_audio: true
+    fbanks_waveform_scale: 32768
+  fbank_feats_pad_idx: 0
+  manifest_list: train_asr_only_aggregated_5_dial_filtered_adapted_wh_transc
+  manifest_list_path: null
+  manifest_path_prefix: /data/home/mavlyutov/s2t_ondevice/
+  max_seconds_per_input_audio: 15
+  fixed_batch_size: 40
+  max_tgt_text_tokens_per_batch: 600
+  max_tgt_text_tokens_per_sample: 300
+  max_units_per_sample: 1500
+  num_threads: 4 
+  prefech_batches: null 
+  prepend_tgt_lang_tag: true
+  shuffle_window: 1000 
+  text_tokenization:
+    from_model: null
+    langtoks:
+    - eng
+    - rus
+    - hin
+    - por
+    - spa
+    spm_path: /data/home/mavlyutov/s2t_ondevice/vocab20k/5_5_20k.model
+  unit_tokenization:
+    from_model: seamlessM4T_large
+    langtoks: null
+    num_units: null
+  unit_tokenizer_name: seamlessM4T_large
+training:
+  eval_steps: 1000 
+  float_dtype: fp32
+  label_smoothing: 0.2
+  learning_rate: 0.0001
+  log_steps: 50
+  max_epochs: 100
+  patience: 10
+  start_learning_rate: 1.0e-07
+  warmup_steps: 1000

+ 74 - 0
scripts/m4t/train/recipes/large_M4T_v1.yaml

@@ -0,0 +1,74 @@
+eval_data:
+  audio:
+    audio_root_dir: /fsx-ust/data/audio_zips/
+    fbanks_num_mel_bins: 80
+    fbanks_standardize_audio: true
+    fbanks_waveform_scale: 32768
+  fbank_feats_pad_idx: 0
+  manifest_list: dev_fleurs_arb-eng,dev_fleurs_ben-eng,dev_fleurs_hin-eng,dev_fleurs_ind-eng,dev_fleurs_ita-eng,dev_fleurs_jpn-eng,dev_fleurs_por-eng,dev_fleurs_rus-eng,dev_fleurs_swh-eng,dev_fleurs_tha-eng,dev_fleurs_tur-eng,dev_fleurs_urd-eng,dev_fleurs_vie-eng,dev_fleurs_spa-eng,dev_fleurs_eng-arb,dev_fleurs_eng-ben,dev_fleurs_eng-hin,dev_fleurs_eng-ind,dev_fleurs_eng-ita,dev_fleurs_eng-jpn,dev_fleurs_eng-por,dev_fleurs_eng-rus,dev_fleurs_eng-swh,dev_fleurs_eng-tha,dev_fleurs_eng-tur,dev_fleurs_eng-urd,dev_fleurs_eng-vie,dev_fleurs_eng-spa
+  manifest_list_path: null
+  manifest_path_prefix: /fsx-ust/spopuri/datasets/S2ST/V1/M4T_V1_phase2/primary/
+  max_seconds_per_input_audio: 150
+  fixed_batch_size: 40
+  max_tgt_text_tokens_per_batch: null
+  max_tgt_text_tokens_per_sample: 3000
+  max_units_per_sample: 1500
+  num_threads: 10 
+  prefech_batches: 10
+  prepend_tgt_lang_tag: true
+  shuffle_window: 1000
+  text_tokenization:
+    from_model: seamlessM4T_large
+    spm_path: null
+    langtoks: null
+  unit_tokenization:
+    from_model: seamlessM4T_large
+    langtoks: null
+    num_units: null
+  unit_tokenizer_name: seamlessM4T_large
+model:
+  custom_params:
+    nllb_vocabulary_size: 256103
+  from_model: null
+  from_model_config: null
+  pretrained_s2t_decoder_path: /fsx-ust/spopuri/datasets/PT_CKPT/S2T/S2T_M4T_V1_V1_cleaned.pt
+  pretrained_t2u_path: /fsx-ust/spopuri/datasets/PT_CKPT/T2U/V5_10K_p2_14_80K.pt 
+  pretrained_w2v2_path: /fsx-ust/spopuri/datasets/PT_CKPT/w2v2/w2vbert2rpq_600m_al5.pt
+train_data:
+  audio:
+    audio_root_dir: /fsx-ust/data/audio_zips/
+    fbanks_num_mel_bins: 80
+    fbanks_standardize_audio: true
+    fbanks_waveform_scale: 32768
+  fbank_feats_pad_idx: 0
+  manifest_list: null 
+  manifest_list_path: /data/home/mavlyutov/train_configs/m4t_v1_train_manifests.txt
+  manifest_path_prefix: /fsx-ust/spopuri/datasets/S2ST/V1/M4T_V1_phase2/primary 
+  max_seconds_per_input_audio: 15
+  fixed_batch_size: null 
+  max_tgt_text_tokens_per_batch: 600
+  max_tgt_text_tokens_per_sample: 300
+  max_units_per_sample: 1500
+  num_threads: 10 
+  prefech_batches: 10
+  prepend_tgt_lang_tag: true
+  shuffle_window: 1000
+  text_tokenization:
+    from_model: seamlessM4T_large
+    spm_path: null
+    langtoks: null
+  unit_tokenization:
+    from_model: seamlessM4T_large
+    langtoks: null
+    num_units: null
+  unit_tokenizer_name: seamlessM4T_large
+training:
+  eval_steps: 5000 
+  float_dtype: fp16
+  label_smoothing: 0.2
+  learning_rate: 0.0001
+  log_steps: 200 
+  max_epochs: 100
+  patience: 10
+  start_learning_rate: 1.0e-07
+  warmup_steps: 1000
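These recipes are plain YAML serializations of the training workflow config consumed by run_training.py below. A minimal sketch of parsing one from Python, assuming the repository recipe path is available locally (illustrative only):

import yaml
from m4t_scripts.train.configs import WorkflowParams

# parse a recipe into the typed workflow config
with open("scripts/m4t/train/recipes/large_M4T_v1.yaml") as fp_in:
    params = WorkflowParams.deserialize(yaml.load(fp_in, Loader=yaml.FullLoader))
print(params.training.learning_rate)  # 0.0001 for this recipe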

+ 94 - 0
scripts/m4t/train/recipes/m4t_v1_train_manifests.txt

@@ -0,0 +1,94 @@
+train_mc_eng-arb
+train_mc_eng-ita
+train_mc_eng-por
+train_mc_eng-rus
+train_mc_eng-spa
+train_mc_eng-tur
+train_mc_eng-vie
+train_cv11_eng-arb
+train_cv11_eng-ben
+train_cv11_eng-hin
+train_cv11_eng-ind
+train_cv11_eng-ita
+train_cv11_eng-jpn
+train_cv11_eng-por
+train_cv11_eng-rus
+train_cv11_eng-spa
+train_cv11_eng-swh
+train_cv11_eng-tha
+train_cv11_eng-tur
+train_cv11_eng-urd
+train_cv11_eng-vie
+train_epst_eng-ita
+train_epst_eng-por
+train_epst_eng-spa
+train_licds2s_eng-vie
+train_cv12_arb-eng
+train_masc_arb-eng
+train_mtedx_arb-eng
+train_shaip_arb-eng
+train_slr108_arb-eng
+train_css10_spa-eng
+train_cv12_spa-eng
+train_epst_spa-eng
+train_mls_spa-eng
+train_mtedx_spa-eng
+train_slr108_spa-eng
+train_vpsr_spa-eng
+train_vpst_spa-eng
+train_cv12_hin-eng
+train_slr118_hin-eng
+train_speechocean_hin-eng
+train_cv12_ind-eng
+train_mdata-c_ind-eng
+train_mdata-s_ind-eng
+train_shaip_ind-eng
+train_speechocean_ind-eng
+train_tt221213_ind-eng
+train_bbl_tur-eng
+train_cv12_tur-eng
+train_mdata-s_tur-eng
+train_slr108_tur-eng
+train_speechocean_tur-eng
+train_tt221213_tur-eng
+train_bbl_swh-eng
+train_cv12_swh-eng
+train_shaip_swh-eng
+train_css10_rus-eng
+train_cv12_rus-eng
+train_mtedx_rus-eng
+train_ruls_rus-eng
+train_bbl_ben-eng
+train_bbl_vie-eng
+train_css10_jpn-eng
+train_epst_ita-eng
+train_epst_por-eng
+train_fosd_vie-eng
+train_kokoro_jpn-eng
+train_mdata-s_jpn-eng
+train_mdata-s_tha-eng
+train_mls_ita-eng
+train_mls_por-eng
+train_mtedx_ita-eng
+train_mtedx_por-eng
+train_reazonspeech-m_jpn-eng
+train_shaip_ben-eng
+train_shaip_jpn-eng
+train_shaip_tha-eng
+train_shaip_vie-eng
+train_slr53_ben-eng
+train_speechocean_urd-eng
+train_tt221213_jpn-eng
+train_tt221213_tha-eng
+train_vivos_vie-eng
+train_vpsr_ita-eng
+train_vpst_ita-eng
+train_cv12_ben-eng
+train_cv12_ita-eng
+train_cv12_jpn-eng
+train_cv12_por-eng
+train_cv12_tha-eng
+train_cv12_urd-eng
+train_cv12_vie-eng
+train_speechocean_urd_2-eng
+train_licds2s_vie-eng
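This manifest list is referenced from large_M4T_v1.yaml via train_data.manifest_list_path (the dev set uses the inline, comma-separated manifest_list field instead). The sketch below only illustrates the assumed format, one manifest name per line:

# illustrative only: read a manifest list file into a list of manifest names
with open("scripts/m4t/train/recipes/m4t_v1_train_manifests.txt") as fp_in:
    manifests = [line.strip() for line in fp_in if line.strip()]
assert len(manifests) == 94  # one entry per non-empty line in this file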

+ 118 - 0
scripts/m4t/train/run_training.py

@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+import platform
+import shutil
+import time
+from pathlib import Path
+from typing import List
+
+import torch
+import yaml
+from m4t_scripts.train import dataloader as _dataloader
+from m4t_scripts.train import dist_utils
+from m4t_scripts.train import model as _model
+from m4t_scripts.train import trainer as _trainer
+from m4t_scripts.train.configs import WorkflowParams
+
+logging_format = f"%(asctime)s - {platform.node()} - %(process)s - %(levelname)s - %(name)s: %(message)s"
+logging.basicConfig(
+    level=logging.INFO,
+    format=logging_format,
+)
+
+logger = logging.getLogger("train")
+
+
+def init_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Run M4T training")
+    parser.add_argument(
+        "--wd",
+        type=Path,
+        required=True,
+        help="Work directory, where logs, checkpoints and core dumps will be stored",
+    )
+    parser.add_argument(
+        "--params",
+        type=Path,
+        required=True,
+        help="Config with training parameters",
+    )
+    return parser
+
+
+def run_training(parameters: WorkflowParams, work_dir: str, checkpoint_dir: str) -> None:
+    logger.info(f"Workflow params: {parameters}")
+    rank, world_size = dist_utils.get_rank(), dist_utils.get_world_size()
+    logger.info(f"Rank: {rank}, world_size: {world_size}")
+    assert torch.cuda.device_count() > 0, "GPU is not available"
+    device = torch.device("cuda")
+    float_dtype = _trainer.UnitYTrainer._get_float_dtype(parameters.training.float_dtype)
+    logger.info(f"Device: {device}, float dtype: {float_dtype}")
+    model = _model.ModelBuilder(config=parameters.model, dtype=float_dtype, device=device).build_model()
+    logger.info(f"Model: {model}")
+    train_data = _dataloader.UnityDataLoader(
+        config=parameters.train_data, rank=rank, world_size=world_size, target_device=device, float_dtype=float_dtype
+    )
+    eval_data = _dataloader.UnityDataLoader(
+        config=parameters.eval_data, rank=rank, world_size=world_size, target_device=device, float_dtype=float_dtype
+    )
+    trainer = _trainer.UnitYTrainer(
+        model=model,
+        params=parameters.training,
+        train_data_loader=train_data,
+        eval_data_loader=eval_data,
+        chck_save_dir=checkpoint_dir,
+        device=device,
+    )
+    trainer.run()
+
+
+def get_loggers() -> List[logging.Logger]:
+    return [logger, _trainer.logger, _dataloader.logger, _model.logger, dist_utils.logger]
+
+
+def set_file_output_for_loggers(log_filename: str) -> None:
+    handler = logging.FileHandler(filename=log_filename, mode="a", delay=False)
+    formatter = logging.Formatter(logging_format)
+    handler.setFormatter(formatter)
+    for logger in get_loggers():
+        logger.handlers.append(handler)
+
+
+def main() -> None:
+    args = init_parser().parse_args()
+    dist_utils.init_distributed(get_loggers())
+    is_master = dist_utils.is_main_process()
+    with open(args.params, "r") as fp_in:
+        parameters = WorkflowParams.deserialize(yaml.load(fp_in, Loader=yaml.FullLoader))
+    ts = str(int(time.time()))
+    work_dir = args.wd
+    checkpoint_dir = os.path.join(work_dir, "checkpoints")
+    if not os.path.exists(checkpoint_dir) and is_master:
+        logger.info(f"Creating checkpoint dir: {checkpoint_dir}")
+        # checkpoint_dir is not used before downstream syncs,
+        #   so we don't expect a race condition and don't run a barrier here
+        os.makedirs(checkpoint_dir)
+    config_path = os.path.join(work_dir, f"{ts}_config.yaml")
+    # copy to work dir to keep a snapshot of workflow config
+    if is_master:
+        shutil.copy(args.params, config_path)
+    log_path = os.path.join(work_dir, "train_log.txt")
+    logger.info(f"Set logging to {log_path}")
+    set_file_output_for_loggers(log_path)
+    try:
+        run_training(parameters=parameters, work_dir=work_dir, checkpoint_dir=checkpoint_dir)
+    except Exception:
+        # make sure that the stack trace is logged to the log files
+        logger.exception("Training failed")
+
+
+if __name__ == "__main__":
+    main()

+ 166 - 0
scripts/m4t/train/run_with_slurm.py

@@ -0,0 +1,166 @@
+import argparse
+import logging
+import os
+import platform
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+
+logging_format = f"%(asctime)s - {platform.node()} - %(process)s - %(levelname)s - %(name)s: %(message)s"
+logging.basicConfig(
+    level=logging.INFO,
+    format=logging_format,
+)
+
+logger = logging.getLogger("train")
+
+
+def init_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Run M4T training")
+    parser.add_argument(
+        "-w",
+        type=Path,
+        required=True,
+        help="Work directory, where logs, checkpoints and core dumps will be stored",
+    )
+    parser.add_argument(
+        "-p",
+        type=Path,
+        required=True,
+        help="Training workflow config",
+    )
+    parser.add_argument(
+        "-n",
+        type=int,
+        required=False,
+        default=1,
+        help="Number of training nodes",
+    )
+    parser.add_argument(
+        "-c",
+        type=str,
+        required=False,
+        default="seamless",
+        help="Cluster partitions to use",
+    )
+    parser.add_argument(
+        "-j",
+        type=str,
+        required=False,
+        default="train",
+        help="Slurm job name",
+    )
+    return parser
+
+
+def prepare_sbatch_config(
+    job_name: str,
+    params_file: str,
+    num_nodes: int,
+    partitions: str,
+    work_dir: str,
+    cluster_logs_dir: str,
+    run_script: str,
+) -> str:
+    return f"""#!/bin/bash
+## job name
+#SBATCH --job-name={job_name}
+
+## filename for job standard output (stdout)
+## %j is the job id, %u is the user id
+#SBATCH --output={cluster_logs_dir}/%j.out
+
+## filename for job standard error output (stderr)
+#SBATCH --error={cluster_logs_dir}/%j.err
+
+## partition name
+#SBATCH --partition={partitions}
+
+## number of nodes
+#SBATCH --nodes={num_nodes}
+
+## number of gpus per node
+#SBATCH --gpus-per-node=8
+
+## number of cpus per task
+#SBATCH --cpus-per-task=96
+
+#SBATCH --gres=gpu:8
+
+## number of tasks per node
+#SBATCH --ntasks-per-node=1
+
+## amount of mem
+#SBATCH --mem 50G
+
+## amount of time in minutes
+#SBATCH --time 2400
+
+set -x
+export WANDB_DISABLED=true
+export HDF5_USE_FILE_LOCKING='FALSE'
+export PARENT=`/bin/hostname -s`
+export MPORT=24198
+export CHILDREN=`scontrol show hostnames $SLURM_JOB_NODELIST | grep -v $PARENT`
+export HOSTLIST="$PARENT $CHILDREN"
+echo $HOSTLIST
+export WORLD_SIZE=$SLURM_NTASKS
+srun --label bash -c 'which python && torchrun \\
+ --nproc_per_node=8 \\
+ --nnodes=$SLURM_JOB_NUM_NODES \\
+ --node_rank="$SLURM_PROCID" \\
+ --master_addr="$PARENT" \\
+ --master_port="$MPORT" \\
+ --log-dir={cluster_logs_dir} \\
+{run_script} --params {params_file}  --wd {work_dir}'
+"""
+
+
+def main() -> None:
+    args = init_parser().parse_args()
+    params_file = args.p
+    num_nodes = args.n
+    partitions = args.c
+    work_dir = args.w
+    job_name = args.j
+
+    assert job_name is not None
+    assert len(job_name.split()) == 1, "spaces in job name not allowed"
+    assert partitions and len(partitions.split()) == 1, "spaces in partitions not allowed"
+    assert os.path.exists(params_file), "config file is missing"
+    training_script_path = os.path.join(os.path.dirname(__file__), "run_training.py")
+    assert os.path.exists(training_script_path), f"Can't find training script {training_script_path}"
+    assert num_nodes > 0
+    if not os.path.exists(work_dir):
+        logger.info(f"Creating workdir {work_dir}")
+        os.makedirs(work_dir)
+    cluster_logs_dir = os.path.join(work_dir, "cluster_logs")
+    if os.path.exists(cluster_logs_dir):
+        logger.info(f"Clearing cluster logs dir {cluster_logs_dir}")
+        shutil.rmtree(cluster_logs_dir)
+    os.makedirs(cluster_logs_dir)
+    config_text = prepare_sbatch_config(
+        job_name=job_name,
+        params_file=params_file,
+        num_nodes=num_nodes,
+        partitions=partitions,
+        work_dir=work_dir,
+        cluster_logs_dir=cluster_logs_dir,
+        run_script=training_script_path,
+    )
+    logger.info(f"SBATCH config to launch: \n{config_text}")
+    fname = f"{int(time.time())}_sbatch.sh"
+    config_path = os.path.join(work_dir, fname)
+    with open(config_path, "w") as fp_out:
+        fp_out.write(config_text)
+        logger.info(f"Saved to {config_path}")
+    command = f"sbatch {config_path}"
+    logger.info(f"Executing command: '{command}'")
+    subprocess.Popen(command, shell=True).communicate()
+    logger.info(f"Train log: {os.path.join(work_dir, 'train_log.txt')}")
+
+
+if __name__ == "__main__":
+    main()
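For reference, a hypothetical multi-node launch with this wrapper could look like the command below (work directory and recipe path are illustrative). The script writes a timestamped sbatch file into the work directory, submits it, and the generated job runs torchrun with run_training.py on each node:

python scripts/m4t/train/run_with_slurm.py \
    -p scripts/m4t/train/recipes/large_M4T_v1.yaml \
    -w /path/to/work_dir \
    -n 4 -c seamless -j m4t_large_v1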

+ 394 - 0
scripts/m4t/train/trainer.py

@@ -0,0 +1,394 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+from typing import Any, Optional, Tuple, Dict, List
+
+import os
+import time
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from fairseq2.models.sequence import SequenceModelOutput
+from fairseq2.optim.lr_scheduler import MyleLR
+from m4t_scripts.train import dataloader, dist_utils
+from torch.optim import Adam
+
+from seamless_communication.models.unity import UnitYModel, UnitYT2UModel
+from m4t_scripts.train.configs import TrainingParams
+
+logger = logging.getLogger(__name__)
+
+
+class UnitYTrainWrapper(nn.Module):
+    """Convenience wrapper that does a forward pass
+    and returns S2T and T2U logits"""
+
+    def __init__(self, model: UnitYModel):
+        super().__init__()
+        self.model: UnitYModel = model
+        if isinstance(self.model.t2u_model, UnitYT2UModel):
+            self.t2u: UnitYT2UModel = self.model.t2u_model
+        else:
+            raise NotImplementedError("Expand UnitYTrainWrapper supports only instances of UnitYT2UModel as t2u")
+
+    def forward(self, batch: dataloader.MultimodalSeqsBatch) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Forward pass, computes S2T and T2U losses"""
+        assert self.model.t2u_model is not None
+        assert batch.speech_to_text.src_tokens is not None
+        # s2t
+        speech_encoder_out, speech_encoder_padding_mask = self.model.encode_speech(
+            seqs=batch.speech_to_text.src_tokens,
+            seq_lens=batch.speech_to_text.src_lengths,
+        )
+        assert batch.speech_to_text.prev_output_tokens is not None
+        text_decoder_out, text_decoder_padding_mask = self.model.decode(
+            seqs=batch.speech_to_text.prev_output_tokens,
+            seq_lens=batch.speech_to_text.target_lengths,
+            encoder_output=speech_encoder_out,
+            encoder_padding_mask=speech_encoder_padding_mask,
+        )
+        text_logits = self.model.final_proj(text_decoder_out)
+        # t2u
+        (
+            unit_encoder_out,
+            unit_encoder_padding_mask,
+        ) = self.t2u.encode(
+            text_decoder_output=text_decoder_out,
+            text_decoder_padding_mask=text_decoder_padding_mask,
+        )
+        unit_decoder_out, _ = self.t2u.decode(
+            seqs=batch.text_to_units.prev_output_tokens,
+            seq_lens=batch.text_to_units.target_lengths,
+            encoder_output=unit_encoder_out,
+            encoder_padding_mask=unit_encoder_padding_mask,
+        )
+        unit_logits = self.model.t2u_model.final_proj(unit_decoder_out)
+        return (text_logits, unit_logits)
+
+
+class CalcLoss:
+    """Calculates per-token negative log likelihood loss for S2T and T2U"""
+
+    def __init__(
+        self,
+        label_smoothing: float,
+        s2t_pad_idx: Optional[int],
+        t2u_pad_idx: Optional[int],
+        s2t_skip_langtok_loss: bool = False,
+    ):
+        self.label_smoothing = label_smoothing
+        self.s2t_pad_idx = s2t_pad_idx
+        self.t2u_pad_idx = t2u_pad_idx
+        self.s2t_ignore_prefix_size = 1 if s2t_skip_langtok_loss else 0
+        self.t2u_ignore_prefix_size = 1
+
+    def __call__(
+        self,
+        batch: dataloader.MultimodalSeqsBatch,
+        text_logits: torch.Tensor,
+        unit_logits: torch.Tensor,
+    ) -> torch.Tensor:
+        assert batch.speech_to_text.target_lengths is not None
+        s2t_numel = torch.sum(batch.speech_to_text.target_lengths).to(text_logits.device)
+        s2t_loss = SequenceModelOutput(logits=text_logits, pad_idx=self.s2t_pad_idx).compute_loss(
+            targets=batch.speech_to_text.target_tokens.to(text_logits.device),
+            ignore_prefix_size=self.s2t_ignore_prefix_size,
+            label_smoothing=self.label_smoothing,
+        )
+        assert batch.text_to_units.target_lengths is not None
+        s2u_numel = torch.sum(batch.text_to_units.target_lengths).to(unit_logits.device)
+        s2u_loss = SequenceModelOutput(logits=unit_logits, pad_idx=self.t2u_pad_idx).compute_loss(
+            targets=batch.text_to_units.target_tokens.to(unit_logits.device),
+            ignore_prefix_size=self.t2u_ignore_prefix_size,
+            label_smoothing=self.label_smoothing,
+        )
+        return s2t_loss / s2t_numel + s2u_loss / s2u_numel
+
+
+class LossCollector:
+    """Aggregrates loss history across nodes"""
+
+    def __init__(self, device: Optional[torch.device] = None, reduce_op: str = "avg"):
+        self.n_samples: float = 0
+        self.val_sum: float = 0.0
+        self.reduce_op = reduce_op
+        self.device = device
+        self.is_distributed = dist_utils.is_dist_initialized()
+
+    def reset(self) -> None:
+        self.n_samples = 0
+        self.val_sum = 0.0
+
+    def update(self, n_samples: int, batch_loss: float) -> None:
+        self.n_samples += n_samples
+        self.val_sum += batch_loss
+
+    def reduce(self) -> float:
+        n_samples, val_sum = self._collect()
+        if self.reduce_op == "avg":
+            return val_sum / (n_samples + 1)  # +1 guards against division by zero on an empty history
+        if self.reduce_op == "sum":
+            return val_sum
+        raise ValueError()
+
+    def _collect(self) -> Tuple[float, float]:
+        if not self.is_distributed:
+            return self.n_samples, self.val_sum
+        local_val = torch.tensor([[self.n_samples, self.val_sum]], device=self.device)
+        all_vals = [torch.zeros((1, 2), device=self.device) for _ in range(dist_utils.get_world_size())]
+        dist.all_gather(all_vals, local_val)
+        losses = torch.concat(all_vals, dim=0)
+        reduced = torch.sum(losses, dim=0).reshape(2).cpu()
+        return reduced[0].item(), reduced[1].item()
+
+
+class UnitYTrainer:
+    CHECKPOINT_BEST = "checkpoint_best.pt"
+
+    def __init__(
+        self,
+        model: UnitYModel,
+        params: TrainingParams,
+        train_data_loader: dataloader.UnityDataLoader,
+        eval_data_loader: Optional[dataloader.UnityDataLoader],
+        chck_save_dir: str,
+        device: torch.device,
+    ):
+        self.params = params
+        self.device = device
+        self.float_dtype = self._get_float_dtype(self.params.float_dtype)
+        self.train_data_loader = train_data_loader
+        self.eval_data_loader = eval_data_loader
+        self.chck_save_dir = chck_save_dir
+
+        assert model.t2u_model is not None
+        self.calc_loss = CalcLoss(
+            label_smoothing=self.params.label_smoothing,
+            s2t_pad_idx=model.pad_idx,
+            t2u_pad_idx=model.t2u_model.pad_idx,
+        )
+        self._try_load_checkpoint(model=model)
+        self.model = self._wrap_model_for_training(model=model)
+
+        # TODO: make tweakable
+        self.optimizer = Adam(
+            params=self.model.parameters(),
+            lr=self.params.learning_rate,
+            betas=(0.9, 0.98),
+            eps=1e-08,
+            maximize=False,
+            weight_decay=0.0,
+            fused=True,
+        )
+
+        self.grad_scaler = torch.cuda.amp.GradScaler() if self.float_dtype == torch.float16 else None  # type: ignore
+
+        # TODO: allow scheduler selection
+        self.lr_scheduler = MyleLR(
+            optimizer=self.optimizer,
+            num_warmup_steps=self.params.warmup_steps,
+            start_lr=self.params.start_learning_rate,
+        )
+
+        self.train_loss_hist = LossCollector(device=self.device)
+        self.epoch_idx: int = 0
+        self.update_idx: int = 0
+        self.patience_left: int = self.params.patience
+        self.last_eval_loss: Optional[float] = None
+        self.best_eval_loss: Optional[float] = None
+        self.is_best_state: bool = False
+        self.batch_sizes: List[int] = []
+        self.gpu_usage: List[float] = []
+
+    def _try_load_checkpoint(self, model: torch.nn.Module):
+        chck_path = self.get_best_checkpoint_path()
+        if os.path.exists(chck_path):
+            logger.info(f"Loading state dict from {chck_path}")
+            state_dict = torch.load(chck_path)
+            model.load_state_dict(state_dict)
+
+    @classmethod
+    def _get_float_dtype(cls, float_dtype: str) -> torch.dtype:
+        if float_dtype == "fp16":
+            return torch.float16
+        elif float_dtype == "fp32":
+            return torch.float32
+        elif float_dtype == "bf16":
+            return torch.bfloat16
+        else:
+            raise ValueError(f"Unkown dtype literal: {float_dtype}")
+
+    def _reset_stats(self) -> None:
+        self.train_loss_hist.reset()
+        self.epoch_idx = 0
+        self.update_idx = 0
+        self.patience_left = self.params.patience
+        self.last_eval_loss = None
+        self.best_eval_loss = None
+        self.is_best_state = False
+        self._reset_log_stats()
+
+    def _reset_log_stats(self) -> None:
+        self.batch_sizes.clear()
+        self.gpu_usage.clear()
+        self.ts = time.time()
+        self.last_update_idx = self.update_idx
+
+    def _record_gpu_usage(self) -> None:
+        gb = (torch.cuda.memory_reserved(self.device) >> 20) / 1024.0
+        self.gpu_usage.append(gb)
+
+    def _get_avg_bsz(self) -> float:
+        """Avg training batch size"""
+        return sum(self.batch_sizes) / len(self.batch_sizes) if self.batch_sizes else 0.0
+
+    def _get_ups(self) -> float:
+        """Updates per second"""
+        ts_delta = time.time() - self.ts
+        return (self.update_idx - self.last_update_idx) / ts_delta
+
+    def _get_avg_gpu_usage(self) -> float:
+        return sum(self.gpu_usage) / len(self.gpu_usage) if self.gpu_usage else 0.0
+
+    def _wrap_model_for_training(self, model: UnitYModel) -> nn.Module:
+        wrapped_model = UnitYTrainWrapper(model=model)
+        if not dist_utils.is_dist_initialized():
+            return wrapped_model
+        return nn.parallel.DistributedDataParallel(
+            wrapped_model,
+            device_ids=[dist_utils.get_local_rank()],
+            find_unused_parameters=True,
+        )
+
+    def _update_eval_stats(self, eval_loss: float) -> None:
+        self.last_eval_loss = eval_loss
+        self.is_best_state = self.best_eval_loss is None or eval_loss < self.best_eval_loss
+        self.best_eval_loss = eval_loss if self.is_best_state else self.best_eval_loss
+        self.patience_left = self.params.patience if self.is_best_state else self.patience_left - 1
+        logger.info(
+            f"Eval after {self.update_idx} updates: "
+            f"loss={eval_loss:.4f} "
+            f"best_loss={self.best_eval_loss:.4f} "
+            f"patience_steps_left={self.patience_left}"
+        )
+
+    def _eval_model(self) -> None:
+        """Calc avg loss on eval dataset and update evaluation stats"""
+        if self.eval_data_loader is None:
+            return
+        logger.info("Run evaluation")
+        loss_hist = LossCollector(device=self.device)
+        self.model.eval()
+        with torch.no_grad():
+            self.eval_data_loader.reset()
+            for batch in self.eval_data_loader.iterate_batches():
+                assert batch.speech_to_text.src_tokens is not None
+                loss = self.calc_loss(batch, *self.model(batch))
+                if loss.isnan():
+                    logger.warning("Eval loss value is NaN, setting to inf")
+                    loss_val = float("Inf")
+                else:
+                    loss_val = loss.item()
+                del batch  # force memory release
+                loss_hist.update(1, loss_val)
+        eval_loss = loss_hist.reduce()
+        self._update_eval_stats(eval_loss)
+
+    def _train_step_log(self):
+        """Log train stats"""
+        if (self.update_idx + 1) % self.params.log_steps == 0:
+            avg_loss = self.train_loss_hist.reduce()
+            self.train_loss_hist.reset()
+            logger.info(
+                f"Epoch {str(self.epoch_idx + 1).zfill(3)} / "
+                f"update {str(self.update_idx + 1).zfill(5)}: "
+                f"train loss={avg_loss:.4f} "
+                f"last lr={self.lr_scheduler.get_last_lr()[0]:.2E} "
+                f"bsz_avg={self._get_avg_bsz():.1f} "
+                f"ups={self._get_ups():.2f} "
+                f"gpu_avg={self._get_avg_gpu_usage():.2f}Gb"
+            )
+            self._reset_log_stats()
+
+    def _train_step(self, batch: dataloader.MultimodalSeqsBatch) -> None:
+        """Run one train step"""
+        self.model.train()
+        self.optimizer.zero_grad()
+        tokens, units = self.model(batch)
+        loss = self.calc_loss(batch, tokens, units)
+        # record reserved GPU memory at this point (proxy for peak usage within the step)
+        self._record_gpu_usage()
+
+        if self.grad_scaler is not None:
+            self.grad_scaler.scale(loss).backward()  # type: ignore
+            self.grad_scaler.step(self.optimizer)
+            self.grad_scaler.update()
+        else:
+            loss.backward()
+            self.optimizer.step()
+
+        self.lr_scheduler.step()
+        assert batch.speech_to_text.src_tokens is not None
+        self.train_loss_hist.update(1, loss.item())
+        self.batch_sizes.append(batch.speech_to_text.src_tokens.shape[0])
+        self._train_step_log()
+
+    def _get_state(self) -> Dict[str, Any]:
+        model_state_dict = self.model.state_dict()
+        model_state_dict = {key.replace("module.model.", ""): value for key, value in model_state_dict.items()}
+        return model_state_dict
+
+    def _get_chck_path(self) -> str:
+        ts = str(int(time.time()))
+        epoch = str(self.epoch_idx).zfill(3)
+        update = str(self.update_idx).zfill(6)
+        eval_loss = f"{self.last_eval_loss:.4f}"
+        name = f"{ts}_{epoch}_{update}_{eval_loss}.pt"
+        return os.path.join(self.chck_save_dir, name)
+
+    def _get_best_checkpoint_link_path(self) -> str:
+        return os.path.join(self.chck_save_dir, self.CHECKPOINT_BEST)
+
+    def get_best_checkpoint_path(self) -> str:
+        return os.path.realpath(self._get_best_checkpoint_link_path())
+
+    def _save_model(self):
+        if dist_utils.is_main_process():
+            state_dict = self._get_state()
+            save_path = self._get_chck_path()
+            logger.info(f"Saving checkpoint to {save_path}")
+            torch.save(state_dict, save_path)
+            if self.is_best_state:
+                best_link_path = self._get_best_checkpoint_link_path()
+                if os.path.exists(best_link_path):
+                    os.unlink(best_link_path)
+                os.symlink(save_path, best_link_path)
+                logger.info(f"Updating pointer to the best checkpoint {best_link_path} -> {save_path}")
+        if dist_utils.is_dist_initialized():
+            dist.barrier()
+
+    def run(self):
+        logger.info("Start training")
+        self._reset_stats()
+        self._eval_model()
+        while self.epoch_idx < self.params.max_epochs and self.patience_left:
+            for train_batch in self.train_data_loader.iterate_batches():
+                self._train_step(batch=train_batch)
+                if self.update_idx and self.update_idx % self.params.eval_steps == 0:
+                    self._eval_model()
+                    if self.is_best_state:
+                        self._save_model()
+                    elif not self.patience_left:
+                        no_improve_steps = self.params.eval_steps * self.params.patience
+                        logger.info(
+                            f"Early termination, as eval loss did not improve over last {no_improve_steps} updates"
+                        )
+                        break
+                self.update_idx += 1
+            self.train_data_loader.reset()
+            self.epoch_idx += 1
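The trainer writes timestamped checkpoints named <ts>_<epoch>_<update>_<eval_loss>.pt and keeps a checkpoint_best.pt symlink pointing at the best one by eval loss; on start-up it resumes from that symlink if it exists. A minimal sketch of loading the resulting weights after training (work directory path illustrative):

import os
import torch

# checkpoint_best.pt lives in <work_dir>/checkpoints and resolves to the best timestamped checkpoint
best_path = os.path.realpath("/path/to/work_dir/checkpoints/checkpoint_best.pt")
state_dict = torch.load(best_path, map_location="cpu")
# keys are plain UnitYModel parameter names; the "module.model." DDP wrapper prefix is stripped on save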

+ 105 - 0
src/seamless_communication/models/tokenizer.py

@@ -0,0 +1,105 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Sequence, Set, final
+
+from fairseq2.data.text import (
+    SentencePieceDecoder,
+    SentencePieceEncoder,
+    SentencePieceModel,
+    TextTokenDecoder,
+    TextTokenEncoder,
+    TextTokenizer,
+    vocabulary_from_sentencepiece,
+)
+from fairseq2.data.typing import PathLike
+from fairseq2.typing import Device, finaloverride
+
+
+@final
+class SPMTokenizer(TextTokenizer):
+    """Represents standard SPM-based tokenizer used in MT tasks"""
+
+    model: SentencePieceModel
+    langs: Set[str]
+    prepend_target_langtok_to_target: bool
+
+    def __init__(self, pathname: PathLike, langs: Sequence[str], prepend_target_langtok_to_target: bool = True) -> None:
+        """
+        :param pathname:
+            The pathname of the SentencePiece model file.
+        :param langs:
+            The list of supported languages.
+        :param prepend_target_langtok_to_target:
+            If ``True``, the target language token is prepended to target-side sequences.
+        """
+        self.langs = set(langs)
+        self.prepend_target_langtok_to_target = prepend_target_langtok_to_target
+
+        # Each language is represented by a `__lang__` control symbol.
+        control_symbols = [self._lang_tok_to_internal(lang) for lang in sorted(langs)]
+        self.model = SentencePieceModel(pathname, control_symbols)
+        vocab_info = vocabulary_from_sentencepiece(self.model)
+        super().__init__(vocab_info)
+
+    @classmethod
+    def _lang_tok_to_internal(cls, lang: str) -> str:
+        return f"__{lang}__"
+
+    @finaloverride
+    def create_encoder(
+        self,
+        *,
+        task: Optional[str] = None,
+        lang: Optional[str] = None,
+        mode: Optional[str] = None,
+        device: Optional[Device] = None,
+        pin_memory: bool = False,
+    ) -> TextTokenEncoder:
+        """Create a token encoder.
+
+        :param task:
+            Must be 'translation'. If ``None``, defaults to 'translation'.
+        :param lang:
+            A language from :attr:`langs`. Must not be ``None``.
+        :param mode:
+            Must be 'source' or 'target'.
+        :param device:
+            The device on which to construct tensors.
+        :param pin_memory:
+            If ``True``, uses pinned memory while constructing tensors.
+        """
+        if task is not None and task != "translation":
+            raise ValueError(f"`task` must be 'translation', but is '{task}' instead.")
+
+        assert lang is not None
+
+        if lang not in self.langs:
+            raise ValueError(f"`lang` must be a supported language, but is '{lang}' instead.")
+
+        if mode is None or mode == "source":
+            prefix_tokens = []
+            suffix_tokens = ["</s>"]
+        elif mode == "target":
+            prefix_tokens = (
+                ["</s>"] + [self._lang_tok_to_internal(lang)] if self.prepend_target_langtok_to_target else []
+            )
+            suffix_tokens = ["</s>"]
+        else:
+            raise ValueError(f"`mode` must be 'source' or 'target', but is '{mode}' instead.")
+
+        return SentencePieceEncoder(
+            self.model,
+            prefix_tokens=prefix_tokens,
+            suffix_tokens=suffix_tokens,
+            device=device,
+            pin_memory=pin_memory,
+        )
+
+    @finaloverride
+    def create_decoder(self) -> TextTokenDecoder:
+        return SentencePieceDecoder(self.model)
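A minimal usage sketch of this tokenizer, presumably what gets built when a recipe sets text_tokenization.spm_path and langtoks (spm path illustrative; the callable encoder/decoder behaviour follows fairseq2's SentencePiece wrappers):

from seamless_communication.models.tokenizer import SPMTokenizer

tokenizer = SPMTokenizer(
    pathname="/path/to/vocab20k/5_5_20k.model",  # illustrative path
    langs=["eng", "rus", "hin", "por", "spa"],
)
# target-side encoding prepends the "</s>" and "__eng__" control tokens when the flag is enabled (default)
encoder = tokenizer.create_encoder(task="translation", lang="eng", mode="target")
token_ids = encoder("hello world")
text = tokenizer.create_decoder()(token_ids)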