1 year ago · 6fadf9e320
--- a/src/seamless_communication/inference/generator.py
+++ b/src/seamless_communication/inference/generator.py
@@ -252,7 +252,7 @@ class UnitYGenerator:
 
				             )
			
 
				             unit_seqs, _ = unit_gen_output.collate()
			
 
				         else:
			
 
				-            unit_decoder_output, decoder_padding_mask, _ = self.model.t2u_model(
			
 
				+            t2u_model_output, decoder_padding_mask, _ = self.model.t2u_model(
			
 
				                 text_decoder_output=decoder_output,
			
 
				                 text_decoder_padding_mask=decoder_padding_mask,
			
 
				                 text_seqs=text_seqs,
			
@@ -260,10 +260,10 @@ class UnitYGenerator:
 
				                 film_cond_emb=prosody_encoder_out,
			
 
				             )
			
 
				             # (B, S_unit, V_unit)
			
 
				-            unit_seqs = unit_decoder_output.logits.argmax(dim=2)
			
 
				+            unit_seqs = t2u_model_output.logits.argmax(dim=2)
			
 
				             # Apply the padding mask to the generated units.
			
 
				             unit_seqs = apply_padding_mask(
			
 
				-                unit_seqs, decoder_padding_mask, unit_decoder_output.vocab_info.pad_idx
			
 
				+                unit_seqs, decoder_padding_mask, t2u_model_output.vocab_info.pad_idx
			
 
				             )
			
 
				 
			
 
				         # Convert to speech units.
			
--- a/src/seamless_communication/streaming/agents/__init__.py
+++ b/src/seamless_communication/streaming/agents/__init__.py
@@ -4,6 +4,9 @@
 
				 # This source code is licensed under the license found in the
			
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 
			
 
				+from seamless_communication.streaming.agents.mma_m4t_s2st import (
			
 
				+    MonotonicM4TS2STAgent as MonotonicM4TS2STAgent,
			
 
				+)
			
 
				 from seamless_communication.streaming.agents.mma_m4t_s2t import (
			
 
				     MonotonicM4TS2TAgent as MonotonicM4TS2TAgent,
			
 
				 )
			
--- a/src/seamless_communication/streaming/agents/mma_m4t_s2st.py
+++ b/src/seamless_communication/streaming/agents/mma_m4t_s2st.py
@@ -0,0 +1,30 @@
 
				+# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				+# All rights reserved.
			
 
				+#
			
 
				+# This source code is licensed under the license found in the
			
 
				+# LICENSE file in the root directory of this source tree.
			
 
				+
			
 
				+from seamless_communication.streaming.agents.offline_w2v_bert_encoder import (
			
 
				+    OfflineWav2VecBertEncoderAgent,
			
 
				+)
			
 
				+from seamless_communication.streaming.agents.online_feature_extractor import (
			
 
				+    OnlineFeatureExtractorAgent,
			
 
				+)
			
 
				+from seamless_communication.streaming.agents.online_text_decoder import (
			
 
				+    UnitYMMATextDecoderAgent,
			
 
				+)
			
 
				+from seamless_communication.streaming.agents.online_unit_decoder import (
			
 
				+    NARUnitYUnitDecoderAgent,
			
 
				+)
			
 
				+from seamless_communication.streaming.agents.unity_pipeline import UnitYAgentPipeline
			
 
				+from simuleval.utils import entrypoint
			
 
				+
			
 
				+
			
 
				+@entrypoint
			
 
				+class MonotonicM4TS2STAgent(UnitYAgentPipeline):
			
 
				+    pipeline = [
			
 
				+        OnlineFeatureExtractorAgent,
			
 
				+        OfflineWav2VecBertEncoderAgent,
			
 
				+        UnitYMMATextDecoderAgent,
			
 
				+        NARUnitYUnitDecoderAgent,
			
 
				+    ]
			
--- a/src/seamless_communication/streaming/agents/online_text_decoder.py
+++ b/src/seamless_communication/streaming/agents/online_text_decoder.py
@@ -6,7 +6,8 @@
 
				 from __future__ import annotations
			
 
				 
			
 
				 from argparse import ArgumentParser, Namespace
			
 
				-from typing import Any, Dict, List, Set, Tuple
			
 
				+from dataclasses import dataclass
			
 
				+from typing import Any, Dict, List, Optional, Set, Tuple
			
 
				 
			
 
				 import torch
			
 
				 from fairseq2.models.nllb.tokenizer import NllbTokenizer
			
@@ -77,13 +78,19 @@ class OnlineTextDecoderAgent(GenericAgent):
 
				         self.device = args.device
			
 
				         self.dtype = args.dtype
			
 
				         self.eos_idx = text_tokenizer.vocab_info.eos_idx
			
 
				-        if getattr(args, "tgt_lang", None) and getattr(args, "prefix_tgt_lang", None):
			
 
				+        if (
			
 
				+            hasattr(args, "tgt_lang")
			
 
				+            and hasattr(args, "prefix_tgt_lang")
			
 
				+            and args.tgt_lang is not None
			
 
				+            and args.prefix_tgt_lang is not None
			
 
				+        ):
			
 
				             assert args.tgt_lang == args.prefix_tgt_lang
			
 
				         tgt_lang = getattr(args, "tgt_lang", None) or getattr(
			
 
				             args, "prefix_tgt_lang", None
			
 
				         )
			
 
				-        token_encoder = text_tokenizer.create_encoder(lang=tgt_lang, mode="target")
			
 
				-        prefix_indices = token_encoder.prefix_indices
			
 
				+        assert tgt_lang is not None
			
 
				+        self.token_encoder = text_tokenizer.create_encoder(lang=tgt_lang, mode="target")
			
 
				+        prefix_indices = self.token_encoder.prefix_indices
			
 
				         assert prefix_indices is not None
			
 
				         self.prefix_indices: List[int] = prefix_indices.tolist()
			
 
				 
			
@@ -218,8 +225,6 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
 
				                 pred_indices[-1:], device=self.device, dtype=torch.int64
			
 
				             ).unsqueeze(0)
			
 
				 
			
 
				-        torch.cuda.empty_cache()
			
 
				-
			
 
				         encoder_output = states.source
			
 
				         decoder_output, _, p_choose = self.model.decode(
			
 
				             target_input, None, encoder_output, None, state_bag=self.state_bag
			
@@ -246,7 +251,11 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
 
				         return index, prob, decoder_output
			
 
				 
			
 
				     def postprocess(
			
 
				-        self, states: DecoderAgentStates, pred_indices: Tensor, finished: bool
			
 
				+        self,
			
 
				+        states: DecoderAgentStates,
			
 
				+        pred_indices: List[int],
			
 
				+        finished: bool,
			
 
				+        decoder_features_out: Optional[Tensor] = None,
			
 
				     ) -> TextSegment:
			
 
				         return TextSegment(
			
 
				             content=" ".join(
			
@@ -319,12 +328,19 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
 
				         prob = None
			
 
				         finished = False
			
 
				         blocked_ngrams = self.get_blocked_ngrams(states.target_indices)
			
 
				+        decoder_features_out = None
			
 
				 
			
 
				         while (
			
 
				             len(states.target_indices + pred_indices) < self.max_len(states)
			
 
				             and len(pred_indices) < self.max_consecutive_writes
			
 
				         ):
			
 
				-            index, prob, _ = self.run_decoder(states, pred_indices)
			
 
				+            index, prob, decoder_features = self.run_decoder(states, pred_indices)
			
 
				+
			
 
				+            if decoder_features_out is None:
			
 
				+                decoder_features_out = decoder_features.new(0)
			
 
				+            decoder_features_out = torch.cat(
			
 
				+                [decoder_features_out, decoder_features], dim=1
			
 
				+            )
			
 
				 
			
 
				             if (
			
 
				                 self.no_early_stop
			
@@ -366,7 +382,7 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
 
				             ) > self.max_len(states)
			
 
				             states.ngram_block_count = 0
			
 
				             return WriteAction(
			
 
				-                self.postprocess(states, torch.tensor(pred_indices), finished),
			
 
				+                self.postprocess(states, pred_indices, finished, decoder_features_out),
			
 
				                 finished=finished,
			
 
				             )
			
 
				         else:
			
@@ -375,3 +391,56 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
 
				 
			
 
				 class MMASpeechToTextDecoderAgent(MMATextDecoderAgent):
			
 
				     source_type = "speech"
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class UnitYTextDecoderOutput:
			
 
				+    decoder_features: Tensor
			
 
				+    tokens: List[str]
			
 
				+    target_indices: Optional[Tensor] = None
			
 
				+
			
 
				+
			
 
				+class UnitYMMATextDecoderAgent(MMASpeechToTextDecoderAgent):
			
 
				+    """
			
 
				+    MMA UnitY text decoder agent which just prepares the decoder
			
 
				+    output for the downstream agent.
			
 
				+    """
			
 
				+
			
 
				+    def postprocess(
			
 
				+        self,
			
 
				+        states: DecoderAgentStates,
			
 
				+        pred_indices: List[int],
			
 
				+        finished: bool,
			
 
				+        decoder_features_out: Optional[Tensor] = None,
			
 
				+    ) -> TextSegment:
			
 
				+        tokens: List[str] = [
			
 
				+            self.text_tokenizer.model.index_to_token(idx) for idx in pred_indices
			
 
				+        ]
			
 
				+        assert decoder_features_out is not None
			
 
				+        token_list = self.prefix_indices + states.target_indices
			
 
				+        if (
			
 
				+            len(pred_indices) > 0
			
 
				+            and pred_indices[-1] != self.text_tokenizer.vocab_info.eos_idx
			
 
				+        ):
			
 
				+            # Append "," to make speech smooth
			
 
				+            # TODO: a temporary solution.
			
 
				+            ending_token_index = self.text_tokenizer.model.token_to_index(",")
			
 
				+            token_list.append(ending_token_index)
			
 
				+            self.state_bag.increment_step()
			
 
				+
			
 
				+            _, _, decoder_features = self.run_decoder(states, [ending_token_index])
			
 
				+            decoder_features_out = torch.cat(
			
 
				+                [decoder_features_out, decoder_features], dim=1
			
 
				+            )
			
 
				+
			
 
				+        target_input = torch.tensor(
			
 
				+            token_list,
			
 
				+            device=self.device,
			
 
				+            dtype=torch.int64,
			
 
				+        ).unsqueeze(0)
			
 
				+
			
 
				+        return TextSegment(
			
 
				+            content=UnitYTextDecoderOutput(decoder_features_out, tokens, target_input),
			
 
				+            finished=finished,
			
 
				+            tgt_lang=states.tgt_lang,
			
 
				+        )
			
--- a/src/seamless_communication/streaming/agents/online_unit_decoder.py
+++ b/src/seamless_communication/streaming/agents/online_unit_decoder.py
@@ -0,0 +1,156 @@
 
				+# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				+# All rights reserved.
			
 
				+#
			
 
				+# This source code is licensed under the license found in the
			
 
				+# LICENSE file in the root directory of this source tree.
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+from argparse import ArgumentParser, Namespace
			
 
				+from typing import Any, List, Optional
			
 
				+
			
 
				+import torch
			
 
				+from seamless_communication.models.unity.model import UnitYModel, UnitYNART2UModel
			
 
				+from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer
			
 
				+from seamless_communication.streaming.agents.online_text_decoder import (
			
 
				+    UnitYTextDecoderOutput,
			
 
				+)
			
 
				+from simuleval.agents import GenericAgent
			
 
				+from simuleval.agents.actions import Action, ReadAction, WriteAction
			
 
				+from simuleval.agents.states import AgentStates
			
 
				+from simuleval.data.segments import Segment, TextSegment
			
 
				+
			
 
				+
			
 
				+class NARUnitDecoderAgentStates(AgentStates):
			
 
				+    def reset(self) -> None:
			
 
				+        self.source_token_list: List[str] = []
			
 
				+        self.source_indices: Optional[torch.Tensor] = None
			
 
				+        self.duration_start_index: int = 0
			
 
				+        self.tgt_lang = None
			
 
				+        super().reset()
			
 
				+
			
 
				+    def update_source(self, segment: Segment) -> None:
			
 
				+        """
			
 
				+        Update states from input segment
			
 
				+        Additionlly update incremental states
			
 
				+
			
 
				+        Args:
			
 
				+            segment (~simuleval.agents.segments.Segment): input segment
			
 
				+        """
			
 
				+        self.source_finished = segment.finished
			
 
				+        if self.tgt_lang is None and segment.tgt_lang is not None:
			
 
				+            self.tgt_lang = segment.tgt_lang
			
 
				+        if segment.is_empty:
			
 
				+            if segment.finished:
			
 
				+                self.target_finished = True
			
 
				+            return
			
 
				+        segment_content: UnitYTextDecoderOutput = segment.content
			
 
				+        content = segment_content.decoder_features
			
 
				+        token = segment_content.tokens
			
 
				+        self.source_indices = segment_content.target_indices
			
 
				+        self.source_token_list += token
			
 
				+        self.source = content
			
 
				+
			
 
				+
			
 
				+class NARUnitYUnitDecoderAgent(GenericAgent):
			
 
				+    """Non-autoregressive unit decoder"""
			
 
				+
			
 
				+    source_type = "text"
			
 
				+    target_type = "text"
			
 
				+
			
 
				+    def __init__(
			
 
				+        self, model: UnitYNART2UModel, tokenizer: UnitTokenizer, args: Namespace
			
 
				+    ) -> None:
			
 
				+        self.model = model
			
 
				+        self.tokenizer = tokenizer
			
 
				+        self.min_unit_chunk_size = args.min_unit_chunk_size
			
 
				+        self.d_factor = args.d_factor
			
 
				+        self.device = args.device
			
 
				+        self.dtype = args.dtype
			
 
				+        self.token_decoder = self.tokenizer.create_decoder()
			
 
				+        super().__init__(args)
			
 
				+
			
 
				+    def build_states(self) -> NARUnitDecoderAgentStates:
			
 
				+        return NARUnitDecoderAgentStates()
			
 
				+
			
 
				+    @property
			
 
				+    def max_len(self) -> int:
			
 
				+        return 200
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def add_args(parser: ArgumentParser) -> None:
			
 
				+        parser.add_argument(
			
 
				+            "--min-unit-chunk-size",
			
 
				+            type=int,
			
 
				+            required=True,
			
 
				+            help="Minimal units to produce every chunk",
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--d-factor",
			
 
				+            type=float,
			
 
				+            default=1.0,
			
 
				+            help="scaling factor for duration prediction",
			
 
				+        )
			
 
				+
			
 
				+    @torch.inference_mode()
			
 
				+    def policy(self, states: NARUnitDecoderAgentStates) -> Action:
			
 
				+        if states.target_finished:
			
 
				+            return WriteAction("", finished=True)
			
 
				+
			
 
				+        if len(states.source_token_list) < 2:
			
 
				+            if not states.source_finished:
			
 
				+                return ReadAction()
			
 
				+            else:
			
 
				+                return WriteAction("", finished=True)
			
 
				+
			
 
				+        model_output, _, durations = self.model(
			
 
				+            text_decoder_output=states.source,
			
 
				+            text_decoder_padding_mask=None,
			
 
				+            text_seqs=states.source_indices,
			
 
				+            duration_factor=self.d_factor,
			
 
				+        )
			
 
				+        durations = durations[0]
			
 
				+
			
 
				+        if states.source_finished and states.duration_start_index > 0:
			
 
				+            # We have to consider one more word for EOS, because we append an EOS at the end.
			
 
				+            if sum(durations[states.duration_start_index :]) == 0:
			
 
				+                # If you reach here, it means that the last source token is a silence token (e.g. punctuations)
			
 
				+                # In that case no need to consider one more token.
			
 
				+                return WriteAction("", finished=True)
			
 
				+            else:
			
 
				+                states.duration_start_index = max(states.duration_start_index - 1, 0)
			
 
				+
			
 
				+        current_duration = sum(durations[states.duration_start_index :])
			
 
				+
			
 
				+        if current_duration < self.min_unit_chunk_size:
			
 
				+            if not states.source_finished:
			
 
				+                # if current untranslated source result less than self.min_unit_chunk_size units
			
 
				+                return ReadAction()
			
 
				+            else:
			
 
				+                if current_duration == 0:
			
 
				+                    return WriteAction("", finished=True)
			
 
				+
			
 
				+        unit_seqs = model_output.logits[0].argmax(dim=-1)
			
 
				+        index_start_offset = sum(durations[: states.duration_start_index])
			
 
				+        unit_seqs = unit_seqs[index_start_offset:].unsqueeze(0)
			
 
				+        units = self.token_decoder(unit_seqs)
			
 
				+
			
 
				+        # minus one because we add a ending_token on each s2t output phrase
			
 
				+        states.duration_start_index = len(durations) - 1
			
 
				+
			
 
				+        return WriteAction(
			
 
				+            TextSegment(
			
 
				+                content=units,
			
 
				+                finished=states.source_finished,
			
 
				+                tgt_lang=states.tgt_lang,
			
 
				+            ),
			
 
				+            finished=states.source_finished,
			
 
				+        )
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_args(cls, args: Namespace, **kwargs: Any) -> NARUnitYUnitDecoderAgent:
			
 
				+        unity_model: UnitYModel = kwargs.get("unity_model", None)
			
 
				+        unit_tokenizer: UnitTokenizer = kwargs.get("unit_tokenizer", None)
			
 
				+        assert unity_model.t2u_model is not None and isinstance(
			
 
				+            unity_model.t2u_model, UnitYNART2UModel
			
 
				+        )
			
 
				+        return cls(model=unity_model.t2u_model, tokenizer=unit_tokenizer, args=args)
			
--- a/src/seamless_communication/streaming/agents/unity_pipeline.py
+++ b/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -62,13 +62,13 @@ class UnitYPipelineMixin:
 
				             "--unity-model-name",
			
 
				             type=str,
			
 
				             help="Unity model name.",
			
 
				-            default="unity_sans_decoder",
			
 
				+            default="seamless_streaming_unity",
			
 
				         )
			
 
				         parser.add_argument(
			
 
				             "--monotonic-decoder-model-name",
			
 
				             type=str,
			
 
				             help="Monotonic decoder model name.",
			
 
				-            default="monotonic_decoder",
			
 
				+            default="seamless_streaming_monotonic_decoder",
			
 
				         )
			
 
				         parser.add_argument(
			
 
				             "--sample-rate",