2 years ago · e568857c64
--- a/src/seamless_communication/cli/streaming/scorers/seamless_whisper_asr_bleu.py
+++ b/src/seamless_communication/cli/streaming/scorers/seamless_whisper_asr_bleu.py
@@ -30,7 +30,7 @@ def normalize_text_whisper(sentences: List[str], lang: str) -> List[str]:
 
				 
			
 
				 
			
 
				 @register_quality_scorer("SEAMLESS_WHISPER_ASR_BLEU")
			
 
				-class SeamlessWhisperASRSacreBLEUScorer(WhisperASRSacreBLEUScorer):
			
 
				+class SeamlessWhisperASRSacreBLEUScorer(WhisperASRSacreBLEUScorer):  # type: ignore
			
 
				     def __init__(
			
 
				         self,
			
 
				         tokenizer: str = "13a",
			
--- a/src/seamless_communication/streaming/agents/common.py
+++ b/src/seamless_communication/streaming/agents/common.py
@@ -21,7 +21,7 @@ class EarlyStoppingMixin:
 
				         raise NotImplementedError()
			
 
				 
			
 
				 
			
 
				-class AgentStates(AgentStatesOrig):
			
 
				+class AgentStates(AgentStatesOrig):  # type: ignore
			
 
				     def update_target(self, segment: Segment) -> None:
			
 
				         """An AgentStates impl which doesn't update states.target"""
			
 
				         self.target_finished = segment.finished
			
--- a/src/seamless_communication/streaming/agents/detokenizer.py
+++ b/src/seamless_communication/streaming/agents/detokenizer.py
@@ -16,7 +16,7 @@ from seamless_communication.streaming.agents.common import (
 
				 )
			
 
				 
			
 
				 
			
 
				-class DetokenizerAgent(NoUpdateTargetMixin, TextToTextAgent):
			
 
				+class DetokenizerAgent(NoUpdateTargetMixin, TextToTextAgent):  # type: ignore
			
 
				     def __init__(self, args: Namespace):
			
 
				         super().__init__(args)
			
 
				         self.detokenize_only = args.detokenize_only
			
--- a/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py
+++ b/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py
@@ -24,7 +24,7 @@ from seamless_communication.streaming.agents.common import (
 
				 )
			
 
				 
			
 
				 
			
 
				-class OfflineWav2VecBertEncoderAgent(NoUpdateTargetMixin, SpeechToSpeechAgent):
			
 
				+class OfflineWav2VecBertEncoderAgent(NoUpdateTargetMixin, SpeechToSpeechAgent):  # type: ignore
			
 
				     """
			
 
				     Incremental encoding of an wav2vec encoder output
			
 
				     It update the whole encoder states every time when there is a new incoming segment.
			
--- a/src/seamless_communication/streaming/agents/online_feature_extractor.py
+++ b/src/seamless_communication/streaming/agents/online_feature_extractor.py
@@ -26,7 +26,7 @@ SAMPLE_RATE = 16000
 
				 FEATURE_DIM = 80
			
 
				 
			
 
				 
			
 
				-class FeatureStates(AgentStates):
			
 
				+class FeatureStates(AgentStates):  # type: ignore
			
 
				     def reset(self) -> None:
			
 
				         super().reset()
			
 
				         self.previous_residual_samples: List[float] = []
			
@@ -45,7 +45,7 @@ class FeatureStates(AgentStates):
 
				             self.source.append(segment.content)
			
 
				 
			
 
				 
			
 
				-class OnlineFeatureExtractorAgent(SpeechToSpeechAgent):
			
 
				+class OnlineFeatureExtractorAgent(SpeechToSpeechAgent):  # type: ignore
			
 
				     """
			
 
				     Extract speech features on the fly.
			
 
				     """
			
--- a/src/seamless_communication/streaming/agents/online_text_decoder.py
+++ b/src/seamless_communication/streaming/agents/online_text_decoder.py
@@ -23,7 +23,7 @@ from simuleval.data.segments import Segment, TextSegment
 
				 from torch import Tensor
			
 
				 
			
 
				 
			
 
				-class DecoderAgentStates(AgentStates):
			
 
				+class DecoderAgentStates(AgentStates):  # type: ignore
			
 
				     def reset(self) -> None:
			
 
				         self.source_len = 0
			
 
				         self.target_indices: List[int] = []
			
@@ -50,7 +50,7 @@ class DecoderAgentStates(AgentStates):
 
				             self.source_len = self.source.size(1)
			
 
				 
			
 
				 
			
 
				-class OnlineTextDecoderAgent(GenericAgent):
			
 
				+class OnlineTextDecoderAgent(GenericAgent):  # type: ignore
			
 
				     """
			
 
				     Online text decoder
			
 
				     """
			
@@ -139,7 +139,7 @@ class OnlineTextDecoderAgent(GenericAgent):
 
				             self.prefix_indices[-1] = tgt_lang_tag_idx
			
 
				 
			
 
				 
			
 
				-class MMATextDecoderAgent(OnlineTextDecoderAgent):
			
 
				+class MMATextDecoderAgent(OnlineTextDecoderAgent):  # type: ignore
			
 
				     def __init__(
			
 
				         self,
			
 
				         model: MonotonicDecoderModel,
			
@@ -278,15 +278,17 @@ class MMATextDecoderAgent(OnlineTextDecoderAgent):
 
				         states: DecoderAgentStates,
			
 
				         pred_indices: List[int],
			
 
				         decoder_features_out: Tensor,
			
 
				-        blocked_ngrams: Set[str],
			
 
				+        blocked_ngrams: Optional[Set[str]],
			
 
				         index: int,
			
 
				-    ) -> bool:
			
 
				+    ) -> Tuple[bool, Tensor]:
			
 
				         """
			
 
				         This check is used to force a READ decision when n-gram repeat
			
 
				         happens before source_finished
			
 
				         """
			
 
				         if not self.block_ngrams or states.source_finished:
			
 
				             return False, decoder_features_out
			
 
				+
			
 
				+        assert blocked_ngrams is not None
			
 
				         all_indices = states.target_indices + pred_indices + [index]
			
 
				         for n in [3, 2]:  # TODO: make it configurable
			
 
				             if len(all_indices) >= n and states.ngram_block_count <= 4:
			
--- a/src/seamless_communication/streaming/agents/online_unit_decoder.py
+++ b/src/seamless_communication/streaming/agents/online_unit_decoder.py
@@ -20,7 +20,7 @@ from simuleval.agents.states import AgentStates
 
				 from simuleval.data.segments import Segment, TextSegment
			
 
				 
			
 
				 
			
 
				-class NARUnitDecoderAgentStates(AgentStates):
			
 
				+class NARUnitDecoderAgentStates(AgentStates):  # type: ignore
			
 
				     def reset(self) -> None:
			
 
				         self.source_token_list: List[str] = []
			
 
				         self.source_indices: Optional[torch.Tensor] = None
			
@@ -51,7 +51,7 @@ class NARUnitDecoderAgentStates(AgentStates):
 
				         self.source = content
			
 
				 
			
 
				 
			
 
				-class NARUnitYUnitDecoderAgent(GenericAgent):
			
 
				+class NARUnitYUnitDecoderAgent(GenericAgent):  # type: ignore
			
 
				     """Non-autoregressive unit decoder"""
			
 
				 
			
 
				     source_type = "text"
			
--- a/src/seamless_communication/streaming/agents/online_vocoder.py
+++ b/src/seamless_communication/streaming/agents/online_vocoder.py
@@ -14,7 +14,7 @@ from simuleval.agents.actions import ReadAction, WriteAction
 
				 from simuleval.data.segments import SpeechSegment
			
 
				 
			
 
				 
			
 
				-class VocoderAgent(TextToSpeechAgent):
			
 
				+class VocoderAgent(TextToSpeechAgent):  # type: ignore
			
 
				     def __init__(self, vocoder: Vocoder, args: Namespace) -> None:
			
 
				         super().__init__(args)
			
 
				         self.sample_rate = args.sample_rate
			
--- a/src/seamless_communication/streaming/agents/silero_vad.py
+++ b/src/seamless_communication/streaming/agents/silero_vad.py
@@ -34,7 +34,7 @@ logger = logging.getLogger(__name__)
 
				 SPEECH_PROB_THRESHOLD = 0.6
			
 
				 
			
 
				 
			
 
				-class SileroVADStates(EarlyStoppingMixin, AgentStates):
			
 
				+class SileroVADStates(EarlyStoppingMixin, AgentStates):  # type: ignore
			
 
				     def __init__(self, args: Namespace) -> None:
			
 
				         self.model, utils = torch.hub.load(
			
 
				             repo_or_dir="snakers4/silero-vad",
			
@@ -253,7 +253,7 @@ class SileroVADStates(EarlyStoppingMixin, AgentStates):
 
				                 )
			
 
				 
			
 
				 
			
 
				-class SileroVADAgent(SpeechToSpeechAgent):
			
 
				+class SileroVADAgent(SpeechToSpeechAgent):  # type: ignore
			
 
				     def __init__(self, args: Namespace) -> None:
			
 
				         super().__init__(args)
			
 
				         self.chunk_size_samples = args.chunk_size_samples
			
--- a/src/seamless_communication/streaming/agents/unity_pipeline.py
+++ b/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -57,7 +57,7 @@ class UnitYPipelineMixin:
 
				 
			
 
				     @classmethod
			
 
				     def add_args(cls, parser: ArgumentParser) -> None:
			
 
				-        super().add_args(parser)
			
 
				+        super().add_args(parser)  # type: ignore
			
 
				         parser.add_argument("--task", type=str, help="Task type")
			
 
				         parser.add_argument(
			
 
				             "--unity-model-name",
			
@@ -157,7 +157,7 @@ class UnitYPipelineMixin:
 
				         }
			
 
				 
			
 
				 
			
 
				-class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline):
			
 
				+class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline):  # type: ignore
			
 
				     pipeline: List[GenericAgent] = []
			
 
				 
			
 
				     def __init__(self, args: Namespace):
			
@@ -199,8 +199,8 @@ class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline):
 
				         return cls(args)
			
 
				 
			
 
				 
			
 
				-class UnitYAgentTreePipeline(UnitYPipelineMixin, TreeAgentPipeline):
			
 
				-    pipeline = {}
			
 
				+class UnitYAgentTreePipeline(UnitYPipelineMixin, TreeAgentPipeline):  # type: ignore
			
 
				+    pipeline: Any = {}
			
 
				 
			
 
				     def __init__(self, args: Namespace):
			
 
				         models_and_configs = self.load_model(args)
			
@@ -231,10 +231,10 @@ class UnitYAgentTreePipeline(UnitYPipelineMixin, TreeAgentPipeline):
 
				             # An early stop.
			
 
				             # The temporary solution is to start over
			
 
				             if states is not None:
			
 
				-                maybe_reset_states(states.values())
			
 
				+                maybe_reset_states(states)
			
 
				             else:
			
 
				                 self.reset()
			
 
				             for segment in output_segment:
			
 
				                 segment.finished = False
			
 
				 
			
 
				-        return output_segment
			
 
				+        return output_segment  # type: ignore[no-any-return]
			
--- a/src/seamless_communication/streaming/dataloaders/s2tt.py
+++ b/src/seamless_communication/streaming/dataloaders/s2tt.py
@@ -38,7 +38,7 @@ def count_lines(filename: Path) -> int:
 
				 
			
 
				 
			
 
				 @register_dataloader("fairseq2_s2tt")
			
 
				-class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader):
			
 
				+class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader):  # type: ignore
			
 
				     def __init__(self, data_pipeline: DataPipeline, args: Namespace) -> None:
			
 
				         self.args = args
			
 
				         self.data_file: Path = Path(getattr(self.args, "data_file", ""))