Ability to change tgt_lang dynamically during streaming inference. (#121)

* Text decoder agent improvements

* Fixing overlooked issues

* revert while loop simplification attempt
Abinesh Ramakrishnan 1 year ago
parent
commit
bc88690d56

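For orientation: when states.tgt_lang is set, the text decoder swaps the language tag at the end of its decoder prefix before generating, so the target language can change between utterances of the same streaming session without rebuilding the agent. The sketch below imitates that prefix rewrite in isolation; the toy vocabulary, token ids, and prefix layout are hypothetical, while the real agent resolves the tag via text_tokenizer.model.token_to_index as shown in the diff.

from typing import Dict, List, Optional

# Hypothetical toy vocabulary; real indices come from the NLLB tokenizer.
TOY_VOCAB: Dict[str, int] = {"</s>": 2, "__eng_Latn__": 10, "__spa_Latn__": 11}


def rewrite_prefix_lang(prefix_indices: List[int], tgt_lang: Optional[str]) -> List[int]:
    """Replace the trailing language tag of the decoder prefix, mirroring
    OnlineTextDecoderAgent.enforce_tgt_lang_in_prefix added in this commit."""
    if tgt_lang:
        prefix_indices = list(prefix_indices)
        prefix_indices[-1] = TOY_VOCAB[f"__{tgt_lang}__"]
    return prefix_indices


# Prefix built at agent construction time for the default tgt_lang (English here).
prefix = [TOY_VOCAB["</s>"], TOY_VOCAB["__eng_Latn__"]]
# Later, a state carrying tgt_lang="spa_Latn" switches decoding to Spanish.
print(rewrite_prefix_lang(prefix, "spa_Latn"))  # [2, 11]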
+ 20 - 14
src/seamless_communication/streaming/agents/online_text_decoder.py

@@ -5,28 +5,26 @@
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations

-import torch
-
 from argparse import ArgumentParser, Namespace
-from torch import Tensor
 from typing import Any, Dict, List, Tuple

+import torch
 from fairseq2.models.nllb.tokenizer import NllbTokenizer
 from fairseq2.nn.incremental_state import IncrementalStateBag
 from seamless_communication.models.monotonic_decoder import (
     MonotonicDecoderConfig,
     MonotonicDecoderModel,
 )
-
 from simuleval.agents import GenericAgent
 from simuleval.agents.actions import Action, ReadAction, WriteAction
 from simuleval.agents.states import AgentStates
 from simuleval.data.segments import Segment, TextSegment
+from torch import Tensor


 class DecoderAgentStates(AgentStates):
     def reset(self) -> None:
-        self.source_steps = 0
+        self.source_len = 0
         self.target_indices: List[int] = []
         self.tgt_lang = None
         super().reset()
@@ -47,7 +45,7 @@ class DecoderAgentStates(AgentStates):
             if len(self.source) == 0 and segment.finished:
                 self.target_finished = True
                 return
-            self.source_steps = self.source.size(1)
+            self.source_len = self.source.size(1)


 class OnlineTextDecoderAgent(GenericAgent):
@@ -80,9 +78,9 @@ class OnlineTextDecoderAgent(GenericAgent):
         self.dtype = args.dtype
         self.eos_idx = text_tokenizer.vocab_info.eos_idx
         token_encoder = text_tokenizer.create_encoder(lang=args.tgt_lang, mode="target")
-        prefix_tokens = token_encoder.prefix_indices
-        assert prefix_tokens is not None
-        self.prefix_tokens: List[int] = prefix_tokens.tolist()
+        prefix_indices = token_encoder.prefix_indices
+        assert prefix_indices is not None
+        self.prefix_indices: List[int] = prefix_indices.tolist()

     def build_states(self) -> DecoderAgentStates:
         return DecoderAgentStates()
@@ -131,6 +129,12 @@ class OnlineTextDecoderAgent(GenericAgent):
     def policy(self, states: DecoderAgentStates) -> Action:
         raise NotImplementedError

+    def enforce_tgt_lang_in_prefix(self, states: DecoderAgentStates) -> None:
+        if states.tgt_lang:
+            tgt_lang_tag = f"__{states.tgt_lang}__"
+            tgt_lang_tag_idx = self.text_tokenizer.model.token_to_index(tgt_lang_tag)
+            self.prefix_indices[-1] = tgt_lang_tag_idx
+

 class MMATextDecoderAgent(OnlineTextDecoderAgent):
     def __init__(
@@ -194,8 +198,9 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
         self, states: DecoderAgentStates, pred_indices: List[int]
     ) -> Tuple[int, float, Tensor]:
         if len(pred_indices) == 0:
+            self.enforce_tgt_lang_in_prefix(states)
             target_input = torch.tensor(
-                self.prefix_tokens + states.target_indices,
+                self.prefix_indices + states.target_indices,
                 device=self.device,
                 dtype=torch.int64,
             ).unsqueeze(0)
@@ -204,7 +209,6 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
                 pred_indices[-1:], device=self.device, dtype=torch.int64
             ).unsqueeze(0)

-        states.source_steps = states.source.size(1)
         torch.cuda.empty_cache()

         encoder_output = states.source
@@ -244,7 +248,7 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
         if len(states.source) == 0:
             return ReadAction()

-        if states.source_steps < self.min_starting_wait and not states.source_finished:
+        if states.source_len < self.min_starting_wait and not states.source_finished:
             return ReadAction()

         if states.target_finished:
@@ -255,6 +259,8 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):

         self.state_bag = IncrementalStateBag(4096)

+        states.source_len = states.source.size(1)
+
         pred_indices: List[int] = []
         index = None
         prob = None
@@ -279,7 +285,7 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
             ):
                 if prob == 1.0:
                     pred_indices = []
-                if states.source_steps < self.min_starting_wait_reset:
+                if states.source_len < self.min_starting_wait_reset:
                     pred_indices = []
                     if len(states.target_indices) < 3:
                         states.target_indices = []
@@ -302,7 +308,7 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
             pred_indices.append(index)
             if self.state_bag.step == 0:
                 self.state_bag.increment_step(
-                    len(self.prefix_tokens + states.target_indices)
+                    len(self.prefix_indices + states.target_indices)
                 )
             else:
                 self.state_bag.increment_step()

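Beyond the rename from source_steps to source_len, the assignment now happens once at the top of policy() rather than inside the decode step, and the value gates decoding against min_starting_wait. A minimal standalone sketch of that gate, with hypothetical frame counts (the real agent reads states.source.size(1)):

from dataclasses import dataclass, field
from typing import List


@dataclass
class ToyStates:
    source: List[float] = field(default_factory=list)  # stand-in for encoder output frames
    source_len: int = 0
    source_finished: bool = False


def ready_to_decode(states: ToyStates, min_starting_wait: int) -> bool:
    """Mirror of the policy() gate: refresh source_len, then read more or decode."""
    states.source_len = len(states.source)  # the agent uses states.source.size(1)
    return states.source_finished or states.source_len >= min_starting_wait


states = ToyStates(source=[0.0] * 8)
print(ready_to_decode(states, min_starting_wait=12))  # False -> ReadAction
states.source += [0.0] * 8
print(ready_to_decode(states, min_starting_wait=12))  # True  -> run the decoder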
+ 11 - 15
src/seamless_communication/streaming/agents/unity_pipeline.py

@@ -4,32 +4,29 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations
-from simuleval.agents.agent import GenericAgent

 import logging
-import torch
-
 from argparse import ArgumentParser, Namespace
 from typing import Any, List, Optional

+import torch
 from fairseq2.assets import asset_store
-from seamless_communication.streaming.agents.mixins import EarlyStoppingMixin
 from seamless_communication.inference.translator import Modality, Translator
+from seamless_communication.models.monotonic_decoder import (
+    load_monotonic_decoder_config,
+    load_monotonic_decoder_model,
+)
 from seamless_communication.models.unity import (
     load_unity_config,
     load_unity_model,
     load_unity_text_tokenizer,
     load_unity_unit_tokenizer,
 )
-from seamless_communication.models.monotonic_decoder import (
-    load_monotonic_decoder_model,
-    load_monotonic_decoder_config,
-)
-
+from seamless_communication.streaming.agents.mixins import EarlyStoppingMixin
 from simuleval.agents import AgentPipeline, AgentStates
+from simuleval.agents.agent import GenericAgent
 from simuleval.data.segments import Segment

-
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
@@ -81,16 +78,11 @@ class UnitYPipelineMixin:
             type=str,
         )

-    @classmethod
-    def from_args(cls, args: Any) -> UnitYPipelineMixin:
-        return cls()
-

 class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline):
     pipeline: List[GenericAgent] = []

     def __init__(self, args: Namespace):
-
         if not torch.cuda.is_available() and "cuda" in args.device:
             raise ValueError("CUDA not available, use CPU.")
 
 
@@ -175,3 +167,7 @@ class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline):
             output_segment.finished = False

         return output_segment
+
+    @classmethod
+    def from_args(cls, args: Any) -> UnitYPipelineMixin:
+        return cls(args)
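
The from_args constructor moves from UnitYPipelineMixin onto UnitYAgentPipeline and, unlike the removed version, forwards the parsed arguments to the constructor instead of calling cls() with none. A standalone sketch of the pattern (a hypothetical Pipeline class, not the real one):

from argparse import Namespace


class Pipeline:
    def __init__(self, args: Namespace) -> None:
        self.device = args.device

    @classmethod
    def from_args(cls, args: Namespace) -> "Pipeline":
        # The removed mixin version returned cls() and dropped the parsed
        # arguments; forwarding them keeps the constructor contract intact.
        return cls(args)


print(Pipeline.from_args(Namespace(device="cpu")).device)  # prints: cpu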