2 vuotta sitten · 157a1628f0
--- a/src/seamless_communication/models/unity/generator.py
+++ b/src/seamless_communication/models/unity/generator.py
@@ -219,7 +219,7 @@ class UnitYGenerator:
 
															             )
														
 
															             unit_seqs, _ = unit_gen_output.collate()
														
 
															         else:
														
 
															-            unit_decoder_output = self.model.t2u_model(
														
 
															+            unit_decoder_output, decoder_padding_mask = self.model.t2u_model(
														
 
															                 text_decoder_output=decoder_output,
														
 
															                 text_decoder_padding_mask=decoder_padding_mask,
														
 
															                 target_seqs=None,
														
@@ -228,6 +228,8 @@ class UnitYGenerator:
 
															             )
														
 
															             # (B, S_unit, V_unit)
														
 
															             unit_seqs = unit_decoder_output.logits.argmax(dim=2)
														
 
															+            # Apply the padding mask to the generated units.
														
 
															+            unit_seqs[decoder_padding_mask == -torch.inf] = unit_decoder_output.pad_idx
														
 
															         # Convert to speech units.
														
 
															         units = self.unit_decoder(unit_seqs)
														
--- a/src/seamless_communication/models/unity/length_regulator.py
+++ b/src/seamless_communication/models/unity/length_regulator.py
@@ -14,6 +14,7 @@ from fairseq2.typing import DataType, Device
 
															 from fairseq2.nn.transformer import create_default_layer_norm
														
 
															 from fairseq2.nn.normalization import LayerNorm
														
 
															 from fairseq2.nn.projection import Linear
														
 
															+from fairseq2.nn.utils.mask import apply_padding_mask
														
 
															 class HardUpsampling(Module):
														
@@ -177,8 +178,8 @@ class VarianceAdaptor(Module):
 
															             torch.round((torch.exp(log_durations) - 1) * duration_factor).long(),
														
 
															             min=min_duration,
														
 
															         )
														
 
															-        if padding_mask is not None:
														
 
															-            durations.masked_fill_(padding_mask, 0)
														
 
															+
														
 
															+        durations = apply_padding_mask(durations, padding_mask)
														
 
															         # TODO: Implement pitch, energy predictors.
														
 
															         # TODO: Implement GaussianUpsampling.
														
--- a/src/seamless_communication/models/unity/model.py
+++ b/src/seamless_communication/models/unity/model.py
@@ -379,7 +379,7 @@ class UnitYNART2UModel(Module):
 
															         target_seqs: Optional[Tensor],
														
 
															         target_seq_lens: Optional[Tensor],
														
 
															         text_seqs: Optional[Tensor],
														
 
															-    ) -> SequenceModelOutput:
														
 
															+    ) -> Tuple[SequenceModelOutput, Optional[Tensor]]:
														
 
															         encoder_output, encoder_padding_mask = self.encode(
														
 
															             text_decoder_output, text_decoder_padding_mask
														
 
															         )
														
@@ -392,7 +392,7 @@ class UnitYNART2UModel(Module):
 
															             text_seqs,
														
 
															         )
														
 
															-        return self.project(decoder_output, decoder_padding_mask)
														
 
															+        return self.project(decoder_output), decoder_padding_mask
														
 
															     def encode(
														
 
															         self,
														
@@ -420,9 +420,7 @@ class UnitYNART2UModel(Module):
 
															         return self.decoder(seqs, padding_mask)  # type: ignore[no-any-return]
														
 
															-    def project(
														
 
															-        self, decoder_output: Tensor, decoder_padding_mask: Optional[Tensor]
														
 
															-    ) -> SequenceModelOutput:
														
 
															+    def project(self, decoder_output: Tensor) -> SequenceModelOutput:
														
 
															         logits = self.final_proj(decoder_output)
														
 
															         return SequenceModelOutput(logits, self.pad_idx)
														
--- a/src/seamless_communication/models/unity/nar_decoder_frontend.py
+++ b/src/seamless_communication/models/unity/nar_decoder_frontend.py
@@ -62,8 +62,11 @@ class TagManager:
 
															         """Add back 0s in place of the prefix, suffix tokens."""
														
 
															         N = dur_or_len.shape[0]
														
 
															-        prefix = dur_or_len.new_zeros((N, self.prefix_len))
														
 
															-        suffix = dur_or_len.new_zeros((N, self.suffix_len))
														
 
															+        # prefix = dur_or_len.new_zeros((N, self.prefix_len))
														
 
															+        # suffix = dur_or_len.new_zeros((N, self.suffix_len))
														
 
															+
														
 
															+        prefix = dur_or_len.new_zeros((N, 1))
														
 
															+        suffix = dur_or_len.new_zeros((N, 1))
														
 
															         dur_or_len = torch.cat([prefix, dur_or_len, suffix], dim=1)
														
 
															         return dur_or_len