1 year ago · d877073d7c
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ setup(
 
				         "fairseq2==0.2.*",
			
 
				         "librosa",
			
 
				         "openai-whisper",
			
 
				-        "simuleval",
			
 
				+        "simuleval~=1.1.1",
			
 
				         "soundfile",
			
 
				         "torchaudio",
			
 
				         "tqdm",
			
--- a/src/seamless_communication/streaming/agents/common.py
+++ b/src/seamless_communication/streaming/agents/common.py
@@ -0,0 +1,34 @@
 
				+# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				+# All rights reserved.
			
 
				+#
			
 
				+# This source code is licensed under the license found in the
			
 
				+# LICENSE file in the root directory of this source tree.
			
 
				+
			
 
				+"""
			
 
				+Mixins + common for fairseq2 simuleval agents
			
 
				+"""
			
 
				+
			
 
				+from simuleval.data.segments import Segment
			
 
				+from simuleval.agents.states import AgentStates as AgentStatesOrig
			
 
				+
			
 
				+
			
 
				+class EarlyStoppingMixin:
			
 
				+    def reset_early(self) -> None:
			
 
				+        """
			
 
				+        Implement to override for different behavior on a reset that
			
 
				+        happens before EOS
			
 
				+        """
			
 
				+        raise NotImplementedError()
			
 
				+
			
 
				+
			
 
				+class AgentStates(AgentStatesOrig):
			
 
				+    def update_target(self, segment: Segment):
			
 
				+        """An AgentStates impl which doesn't update states.target"""
			
 
				+        self.target_finished = segment.finished
			
 
				+
			
 
				+
			
 
				+class NoUpdateTargetMixin:
			
 
				+    """A shortcut to make agents default to the AgentStates impl above"""
			
 
				+
			
 
				+    def build_states(self) -> AgentStates:
			
 
				+        return AgentStates()
			
--- a/src/seamless_communication/streaming/agents/detokenizer.py
+++ b/src/seamless_communication/streaming/agents/detokenizer.py
@@ -10,10 +10,13 @@ from typing import Any, Dict
 
				 
			
 
				 from simuleval.agents import TextToTextAgent
			
 
				 from simuleval.agents.actions import Action, ReadAction, WriteAction
			
 
				-from simuleval.agents.states import AgentStates
			
 
				+from seamless_communication.streaming.agents.common import (
			
 
				+    AgentStates,
			
 
				+    NoUpdateTargetMixin,
			
 
				+)
			
 
				 
			
 
				 
			
 
				-class DetokenizerAgent(TextToTextAgent):
			
 
				+class DetokenizerAgent(TextToTextAgent, NoUpdateTargetMixin):
			
 
				     def __init__(self, args: Namespace):
			
 
				         super().__init__(args)
			
 
				         self.detokenize_only = args.detokenize_only
			
--- a/src/seamless_communication/streaming/agents/mixins.py
+++ b/src/seamless_communication/streaming/agents/mixins.py
@@ -1,18 +0,0 @@
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				-# All rights reserved.
			
 
				-#
			
 
				-# This source code is licensed under the license found in the
			
 
				-# LICENSE file in the root directory of this source tree.
			
 
				-
			
 
				-"""
			
 
				-Mixins for fairseq2 simuleval agents
			
 
				-"""
			
 
				-
			
 
				-
			
 
				-class EarlyStoppingMixin:
			
 
				-    def reset_early(self) -> None:
			
 
				-        """
			
 
				-        Implement to override for different behavior on a reset that
			
 
				-        happens before EOS
			
 
				-        """
			
 
				-        raise NotImplementedError()
			
--- a/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py
+++ b/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py
@@ -15,12 +15,16 @@ from fairseq2.data.text import TextTokenizer
 
				 from fairseq2.models.wav2vec2 import Wav2Vec2EncoderConfig
			
 
				 from fairseq2.nn.padding import get_seqs_and_padding_mask
			
 
				 from seamless_communication.models.unity.model import UnitYModel
			
 
				-from simuleval.agents import AgentStates, SpeechToSpeechAgent
			
 
				+from simuleval.agents import SpeechToSpeechAgent
			
 
				 from simuleval.agents.actions import Action, ReadAction, WriteAction
			
 
				 from simuleval.data.segments import SpeechSegment
			
 
				+from seamless_communication.streaming.agents.common import (
			
 
				+    AgentStates,
			
 
				+    NoUpdateTargetMixin,
			
 
				+)
			
 
				 
			
 
				 
			
 
				-class OfflineWav2VecBertEncoderAgent(SpeechToSpeechAgent):
			
 
				+class OfflineWav2VecBertEncoderAgent(SpeechToSpeechAgent, NoUpdateTargetMixin):
			
 
				     """
			
 
				     Incremental encoding of an wav2vec encoder output
			
 
				     It update the whole encoder states every time when there is a new incoming segment.
			
--- a/src/seamless_communication/streaming/agents/online_feature_extractor.py
+++ b/src/seamless_communication/streaming/agents/online_feature_extractor.py
@@ -16,8 +16,8 @@ from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput
 
				 
			
 
				 from simuleval.agents import SpeechToSpeechAgent
			
 
				 from simuleval.agents.actions import Action, ReadAction, WriteAction
			
 
				-from simuleval.agents.states import AgentStates
			
 
				 from simuleval.data.segments import Segment, SpeechSegment
			
 
				+from seamless_communication.streaming.agents.common import AgentStates
			
 
				 
			
 
				 
			
 
				 SHIFT_SIZE = 10
			
--- a/src/seamless_communication/streaming/agents/online_text_decoder.py
+++ b/src/seamless_communication/streaming/agents/online_text_decoder.py
@@ -17,7 +17,7 @@ from seamless_communication.models.monotonic_decoder import (
 
				 )
			
 
				 from simuleval.agents import GenericAgent
			
 
				 from simuleval.agents.actions import Action, ReadAction, WriteAction
			
 
				-from simuleval.agents.states import AgentStates
			
 
				+from seamless_communication.streaming.agents.common import AgentStates
			
 
				 from simuleval.data.segments import Segment, TextSegment
			
 
				 from torch import Tensor
			
 
				 
			
--- a/src/seamless_communication/streaming/agents/silero_vad.py
+++ b/src/seamless_communication/streaming/agents/silero_vad.py
@@ -17,8 +17,11 @@ from typing import Any, List, Optional, Union
 
				 import numpy as np
			
 
				 import torch
			
 
				 import soundfile
			
 
				-from seamless_communication.streaming.agents.mixins import EarlyStoppingMixin
			
 
				-from simuleval.agents import AgentStates, SpeechToSpeechAgent
			
 
				+from seamless_communication.streaming.agents.common import (
			
 
				+    AgentStates,
			
 
				+    EarlyStoppingMixin,
			
 
				+)
			
 
				+from simuleval.agents import SpeechToSpeechAgent
			
 
				 from simuleval.agents.actions import Action, ReadAction, WriteAction
			
 
				 from simuleval.data.segments import EmptySegment, Segment, SpeechSegment
			
 
				 
			
--- a/src/seamless_communication/streaming/agents/unity_pipeline.py
+++ b/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -22,8 +22,11 @@ from seamless_communication.models.unity import (
 
				     load_unity_text_tokenizer,
			
 
				     load_unity_unit_tokenizer,
			
 
				 )
			
 
				-from seamless_communication.streaming.agents.mixins import EarlyStoppingMixin
			
 
				-from simuleval.agents import AgentPipeline, AgentStates
			
 
				+from seamless_communication.streaming.agents.common import (
			
 
				+    AgentStates,
			
 
				+    EarlyStoppingMixin,
			
 
				+)
			
 
				+from simuleval.agents import AgentPipeline
			
 
				 from simuleval.agents.agent import GenericAgent
			
 
				 from simuleval.data.segments import Segment