@@ -76,7 +76,9 @@ class UnityDataLoader:
         self.manifest_paths = list(self._iterate_manifest_paths())
         self.text_tokenizer = self._init_text_tokenizer()
         self.unit_tokenizer = self._init_unit_tokenizer()
-        self.spm_encoder = SentencePieceEncoder(model=self.text_tokenizer.model, suffix_tokens=["</s>"])
+        self.spm_encoder = SentencePieceEncoder(
+            model=self.text_tokenizer.model, suffix_tokens=["</s>"]
+        )
         self.text_prefix_tokens = self._build_text_tgt_prefixes()
         self.unit_prefix_tokens = self._build_unit_tgt_prefixes()
         if self.config.fixed_batch_size is None:
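The `suffix_tokens=["</s>"]` argument makes the encoder append EOS to every encoded target sequence. A rough standalone equivalent with the plain `sentencepiece` package (hypothetical model path; this is a sketch, not the fairseq2 `SentencePieceEncoder` API):

```python
import sentencepiece as spm

# Hypothetical model path; stands in for self.text_tokenizer.model
sp = spm.SentencePieceProcessor(model_file="spm.model")

def encode_with_eos(text: str) -> list:
    # Token ids for the text with EOS ("</s>") appended, mirroring
    # suffix_tokens=["</s>"] in the loader's encoder
    return sp.encode(text, out_type=int) + [sp.eos_id()]
```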
@@ -88,18 +90,25 @@ class UnityDataLoader:
 
     @classmethod
     def _set_mkl_num_threads(cls):
-        """ Setting mkl num threads to 1, so that we don't get thread explosion."""
-        mkl_rt = ctypes.CDLL('libmkl_rt.so')
+        """Setting mkl num threads to 1, so that we don't get thread explosion."""
+        mkl_rt = ctypes.CDLL("libmkl_rt.so")
         mkl_rt.mkl_set_num_threads(ctypes.byref(ctypes.c_int(1)))
 
     def _calculate_tgt_text_batch_shapes(self) -> List[Tuple[int, int]]:
         max_seq_len = self.config.max_tgt_text_tokens_per_sample
         max_tokens_per_batch = self.config.max_tgt_text_tokens_per_batch
         assert max_tokens_per_batch is not None, "max_tokens_per_batch is not set"
+        max_bsz = (
+            self.config.max_batch_size
+            if self.config.max_batch_size is not None
+            else max_tokens_per_batch
+        )
         step = self.BATCH_WIDTH_STEP
         bucket_sizes = []
         for seq_len in range(step, max(step, max_seq_len) + 1, step):
             bsz = max(1, max_tokens_per_batch // seq_len)
+            if bsz > max_bsz:
+                continue
             bucket_sizes.append((bsz, seq_len))
         return bucket_sizes
 
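The new `max_bsz` guard caps how many rows a bucket of short sequences may hold. A minimal standalone sketch of the bucketing rule, with hypothetical values in place of the config fields and `BATCH_WIDTH_STEP`:

```python
from typing import List, Optional, Tuple

def calculate_batch_shapes(
    max_seq_len: int,
    max_tokens_per_batch: int,
    max_batch_size: Optional[int] = None,
    step: int = 8,  # stand-in for BATCH_WIDTH_STEP
) -> List[Tuple[int, int]]:
    """Enumerate (batch_size, seq_len) buckets under a token budget."""
    max_bsz = max_batch_size if max_batch_size is not None else max_tokens_per_batch
    bucket_sizes = []
    for seq_len in range(step, max(step, max_seq_len) + 1, step):
        bsz = max(1, max_tokens_per_batch // seq_len)
        if bsz > max_bsz:
            # short sequences would otherwise produce huge batches
            continue
        bucket_sizes.append((bsz, seq_len))
    return bucket_sizes

# e.g. a 1000-token budget, sequences up to 32 tokens, at most 64 rows per batch
print(calculate_batch_shapes(32, 1000, max_batch_size=64))
# [(62, 16), (41, 24), (31, 32)] — the 8-token bucket is skipped since 125 > 64
```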
@@ -128,7 +137,8 @@ class UnityDataLoader:
         assert self.config.text_tokenization.langtoks is not None
         assert self.config.text_tokenization.spm_path is not None
         return SPMTokenizer(
-            pathname=self.config.text_tokenization.spm_path, langs=self.config.text_tokenization.langtoks
+            pathname=self.config.text_tokenization.spm_path,
+            langs=self.config.text_tokenization.langtoks,
         )
 
     def _init_unit_tokenizer(self) -> UnitTokenizer:
@@ -154,7 +164,9 @@ class UnityDataLoader:
     def _infer_manifest_full_path(self, manifest_name: str) -> str:
         full_path = manifest_name.strip()
         if self.config.manifest_path_prefix is not None:
-            full_path = os.path.join(self.config.manifest_path_prefix.strip(), full_path)
+            full_path = os.path.join(
+                self.config.manifest_path_prefix.strip(), full_path
+            )
         if not full_path.endswith(self.MANIFEST_EXT) and not os.path.exists(full_path):
             full_path += self.MANIFEST_EXT
         if not os.path.exists(full_path):
@@ -188,7 +200,9 @@ class UnityDataLoader:
             self.TARGET_LANG_COLUMN,
         ]:
             if column not in column_names:
-                raise ValueError(f"Column `{column}` is not present in `{manifest_path}` ")
+                raise ValueError(
+                    f"Column `{column}` is not present in `{manifest_path}` "
+                )
         return column_names
 
     def _builder_from_manifest(self, manifest_path: str) -> DataPipelineBuilder:
@@ -230,7 +244,9 @@ class UnityDataLoader:
         # Split each text line into its fields.
         fields = self._read_column_names(manifest_path)
         logger.debug(f"Column names: {fields}")
-        txt_splitter = StrSplitter(sep=self.MANIFEST_COLUMN_SEP, names=fields, indices=[], exclude=True)
+        txt_splitter = StrSplitter(
+            sep=self.MANIFEST_COLUMN_SEP, names=fields, indices=[], exclude=True
+        )
         pipeline.map(
             txt_splitter,
             selector=self.ROOT_COLUMN,
@@ -244,7 +260,10 @@ class UnityDataLoader:
         Picks samples from per-manifest pipelines in a round-robin order"""
         # TODO: add the ability to upsample/downsample manifests
         logger.info(f"Aggregating data from {len(self.manifest_paths)} manifests")
-        builders = [self._builder_from_manifest(manifest_path=path) for path in self.manifest_paths]
+        builders = [
+            self._builder_from_manifest(manifest_path=path)
+            for path in self.manifest_paths
+        ]
         pipelines = [builder.and_return() for builder in builders]
         return DataPipeline.round_robin(pipelines=pipelines)
 
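`DataPipeline.round_robin` interleaves the per-manifest pipelines one sample at a time. A simplified pure-Python sketch of the sampling order (fairseq2's actual exhaustion semantics may differ):

```python
from itertools import cycle
from typing import Iterable, Iterator, List

def round_robin(pipelines: List[Iterable]) -> Iterator:
    """Yield one sample from each pipeline in turn (simplified:
    stops when the shortest pipeline is exhausted)."""
    iterators = [iter(p) for p in pipelines]
    for it in cycle(iterators):
        try:
            yield next(it)
        except StopIteration:
            return

manifests = [["a1", "a2"], ["b1", "b2"], ["c1", "c2"]]
print(list(round_robin(manifests)))  # ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
```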
@@ -276,7 +295,7 @@ class UnityDataLoader:
             channel_last=True,  # audio channel is the last dimension in the waveform
             standardize=self.config.audio.fbanks_standardize_audio,
             keep_waveform=False,
-            device=self.target_device,
+            device=self.CPU_DEVICE,  # avoid uncontrolled memory cons on GPUs
             dtype=self.float_dtype,
         )
         builder.map(
@@ -286,7 +305,9 @@ class UnityDataLoader:
         )
         return builder
 
-    def _attach_target_tokens(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
+    def _attach_target_tokens(
+        self, builder: DataPipelineBuilder
+    ) -> DataPipelineBuilder:
         # Convert `raw_tgt_text` to (full) target tokenized sequences:
         # <eos> <lang_tok> <tokens .. > <eos>
         # Lang tokens change between rows, so can't use static encoder
@@ -305,7 +326,10 @@ class UnityDataLoader:
         # 3) Not a computational blocker
         convert_to_units = lambda units_str: (  # noqa: E731
             torch.LongTensor(
-                [int(unit_id) + 4 for unit_id in units_str.rstrip().bytes().decode("utf-8").split()]
+                [
+                    int(unit_id) + 4
+                    for unit_id in units_str.rstrip().bytes().decode("utf-8").split()
+                ]
                 + [self.unit_tokenizer.vocab_info.eos_idx]
             )
         )
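`convert_to_units` parses the whitespace-separated unit ids from the manifest, shifts them past what is presumably a block of reserved special-symbol indices (the `+ 4`), and appends the unit EOS. A standalone sketch with placeholder constants (in the pipeline the value arrives as a raw buffer, hence the `.bytes().decode("utf-8")` step):

```python
import torch

# Hypothetical special-symbol layout; the real values come from
# self.unit_tokenizer.vocab_info in the loader.
UNIT_OFFSET = 4   # first 4 indices assumed reserved for special symbols
EOS_IDX = 2       # placeholder for vocab_info.eos_idx

def convert_to_units(units_str: str) -> torch.Tensor:
    """Parse a unit-id string, shift ids past the reserved block, append EOS."""
    unit_ids = [int(unit_id) + UNIT_OFFSET for unit_id in units_str.rstrip().split()]
    return torch.LongTensor(unit_ids + [EOS_IDX])

print(convert_to_units("17 5 5 321 "))  # tensor([ 21,   9,   9, 325,   2])
```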
@@ -340,23 +364,43 @@ class UnityDataLoader:
 
     def _is_long_sample(self, sample: Any) -> bool:
         # input audio length
-        if self._get_input_audio_seconds(sample) > self.config.max_seconds_per_input_audio:
+        if (
+            self._get_input_audio_seconds(sample)
+            > self.config.max_seconds_per_input_audio
+        ):
             return True
 
         # target text tokens
-        num_tgt_text_tokens = sample[self.ROOT_COLUMN][self.TARGET_TEXT_COLUMN].shape[-1]
+        num_tgt_text_tokens = sample[self.ROOT_COLUMN][self.TARGET_TEXT_COLUMN].shape[
+            -1
+        ]
         if num_tgt_text_tokens > self.config.max_tgt_text_tokens_per_sample:
             return True
 
         # target units
-        num_tgt_units = sample[self.ROOT_COLUMN][self.TARGET_UNITS_COLUMN].shape[-1]  # target units
+        num_tgt_units = sample[self.ROOT_COLUMN][self.TARGET_UNITS_COLUMN].shape[
+            -1
+        ]  # target units
         if num_tgt_units > self.config.max_units_per_sample:
             return True
         return False
 
+    def _nans_in_fbanks(self, sample: Any) -> bool:
+        """Tells if NaNs present in fbank"""
+        fbank = sample[self.ROOT_COLUMN][self.AUDIO_COLUMN_NAME]["data"]["fbank"]
+        has_nans: bool = torch.any(torch.isnan(fbank)).item()  # type: ignore
+        if has_nans:
+            logger.warning("Sample fbank contains NaNs. Skipping")
+        return has_nans
+
     def _filter_samples(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
-        # Drop long samples
-        builder.filter(lambda sample: not self._is_long_sample(sample))
+        # Drop:
+        # - "long" samples
+        # - samples with fbanks that contain NaNs
+        builder.filter(
+            lambda sample: not self._is_long_sample(sample)
+            and not self._nans_in_fbanks(sample)
+        )
         return builder
 
     def _batch_samples(self, builder: DataPipelineBuilder) -> DataPipelineBuilder:
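The filter now also drops samples whose fbanks contain NaNs. A toy illustration of the predicate on a stand-in sample layout (hypothetical key names in place of the loader's column constants):

```python
import math
import torch

# Toy stand-ins for self.ROOT_COLUMN / self.AUDIO_COLUMN_NAME
ROOT, AUDIO, FBANK = "root", "audio", "fbank"

def nans_in_fbanks(sample: dict) -> bool:
    """True if any NaN is present in the sample's fbank tensor."""
    fbank = sample[ROOT][AUDIO]["data"][FBANK]
    return bool(torch.any(torch.isnan(fbank)).item())

good = {ROOT: {AUDIO: {"data": {FBANK: torch.zeros(3, 80)}}}}
bad = {ROOT: {AUDIO: {"data": {FBANK: torch.tensor([[0.0, math.nan]])}}}}
kept = [s for s in [good, bad] if not nans_in_fbanks(s)]
print(len(kept))  # 1 — the NaN sample is dropped
```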
@@ -415,8 +459,12 @@ class UnityDataLoader:
         prev_output_tokens = prev_output_tokens[:, :-1]
 
         target_tokens = tokens[:, 1:]
-        assert torch.equal(torch.count_nonzero(prev_output_tokens != pad_idx, dim=1), target_lengths)
-        assert torch.equal(torch.count_nonzero(target_tokens != pad_idx, dim=1), target_lengths)
+        assert torch.equal(
+            torch.count_nonzero(prev_output_tokens != pad_idx, dim=1), target_lengths
+        )
+        assert torch.equal(
+            torch.count_nonzero(target_tokens != pad_idx, dim=1), target_lengths
+        )
         return prev_output_tokens, target_tokens, target_lengths
 
     def _get_text_to_units_batch(self, raw_batch: Any) -> SeqsBatch:
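The asserts check that the teacher-forcing shift preserves the non-pad token counts. A toy, unpadded illustration (in the real batcher `prev_output_tokens` is prepared upstream so the invariant also holds for padded rows):

```python
import torch

pad_idx = 1
# Toy single row without padding: <eos>=2 <lang>=5 t1=10 t2=11 <eos>=2
tokens = torch.tensor([[2, 5, 10, 11, 2]])
target_lengths = torch.tensor([4])

prev_output_tokens = tokens[:, :-1]  # decoder input: <eos> <lang> t1 t2
target_tokens = tokens[:, 1:]        # ground truth:  <lang> t1 t2 <eos>

# The loader's asserts: both views hold `target_lengths` non-pad tokens per row
assert torch.equal(
    torch.count_nonzero(prev_output_tokens != pad_idx, dim=1), target_lengths
)
assert torch.equal(
    torch.count_nonzero(target_tokens != pad_idx, dim=1), target_lengths
)
```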
@@ -448,7 +496,9 @@ class UnityDataLoader:
             prefix_tokens=prefix_tokens.to(self.target_device),
         )
 
-    def _get_speech_src_tokens_and_lengths(self, raw_batch: Any) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _get_speech_src_tokens_and_lengths(
+        self, raw_batch: Any
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         fbanks = raw_batch[self.ROOT_COLUMN][self.AUDIO_COLUMN_NAME]["data"]["fbank"]
         return fbanks["seqs"].to(self.float_dtype), fbanks["seq_lens"]
 
@@ -471,7 +521,9 @@ class UnityDataLoader:
             pad_idx=pad_idx,
             eos_idx=eos_idx,
         )
-        src_tokens, src_lengths = self._get_speech_src_tokens_and_lengths(raw_batch=raw_batch)
+        src_tokens, src_lengths = self._get_speech_src_tokens_and_lengths(
+            raw_batch=raw_batch
+        )
 
         return SeqsBatch(
             src_tokens=src_tokens.to(self.target_device),