2 years ago · 0d2c128b4a
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ setup(
 
				             "m4t_finetune=seamless_communication.cli.m4t.finetune.finetune:main",
			
 
				             "m4t_prepare_dataset=seamless_communication.cli.m4t.finetune.dataset:main",
			
 
				             "m4t_audio_to_units=seamless_communication.cli.m4t.audio_to_units.audio_to_units:main",
			
 
				+            "streaming_evaluate=seamless_communication.cli.streaming.evaluate:main",
			
 
				         ],
			
 
				     },
			
 
				     include_package_data=True,
			
--- a/src/seamless_communication/cli/streaming/evaluate.py
+++ b/src/seamless_communication/cli/streaming/evaluate.py
@@ -4,45 +4,121 @@
 
				 # This source code is licensed under the license found in the
			
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 
			
 
				+import argparse
			
 
				+
			
 
				+from fairseq2.assets import asset_store, download_manager
			
 
				+from seamless_communication.cli.eval_utils import get_tokenizer
			
 
				 from seamless_communication.cli.streaming.scorers.seamless_whisper_asr_bleu import (
			
 
				     SeamlessWhisperASRSacreBLEUScorer as SeamlessWhisperASRSacreBLEUScorer,
			
 
				 )
			
 
				-from seamless_communication.streaming.agents.mma_m4t_s2st import MonotonicM4TS2STAgent
			
 
				-from simuleval.cli import evaluate
			
 
				+from seamless_communication.streaming.agents.mma_m4t_s2st import (
			
 
				+    MonotonicM4TS2STAgent,
			
 
				+    SeamlessS2STAgent,
			
 
				+)
			
 
				+from seamless_communication.streaming.agents.mma_m4t_s2t import MonotonicM4TS2TAgent
			
 
				+from simuleval.evaluator import build_evaluator
			
 
				+from simuleval.utils.agent import EVALUATION_SYSTEM_LIST, build_system_args
			
 
				 
			
 
				 
			
 
				-if __name__ == "__main__":
			
 
				-    tgt_lang = "eng"
			
 
				+def main() -> None:
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        add_help=False,
			
 
				+        description="Streaming evaluation of Seamless UnitY models",
			
 
				+        conflict_handler="resolve",
			
 
				+    )
			
 
				 
			
 
				-    data_configs = dict(
			
 
				-        dataloader="fairseq2_s2tt",
			
 
				-        dataloader_class="seamless_communication.streaming.dataloaders.s2tt.SimulEvalSpeechToTextDataloader",
			
 
				-        data_file="/large_experiments/seamless/ust/annaysun/datasets/s2ut_pt/x2t_v2/dev_fleurs_spa-eng.tsv",
			
 
				-        tgt_lang=tgt_lang,
			
 
				-        audio_root_dir="/large_experiments/seamless/ust/data/audio_zips",
			
 
				-        end_index=10,
			
 
				+    parser.add_argument(
			
 
				+        "--task",
			
 
				+        choices=["s2st", "s2tt"],
			
 
				+        required=True,
			
 
				+        type=str,
			
 
				+        help="Target language to translate/transcribe into.",
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "--expressive",
			
 
				+        action="store_true",
			
 
				+        default=False,
			
 
				+        help="Expressive streaming S2ST inference",
			
 
				     )
			
 
				+    parser.add_argument(
			
 
				+        "--dtype",
			
 
				+        default="fp16",
			
 
				+        type=str,
			
 
				+    )
			
 
				+
			
 
				+    args, _ = parser.parse_known_args()
			
 
				 
			
 
				     model_configs = dict(
			
 
				-        agent_class="seamless_communication.streaming.agents.mma_m4t_s2st.MonotonicM4TS2STAgent",
			
 
				         source_segment_size=320,
			
 
				-        task="s2st",
			
 
				         device="cuda:0",
			
 
				-        dtype="fp16",
			
 
				+        dtype=args.dtype,
			
 
				         min_starting_wait_w2vbert=192,
			
 
				         decision_threshold=0.5,
			
 
				-        min_unit_chunk_size=50,
			
 
				         no_early_stop=True,
			
 
				-        max_len_a=0,
			
 
				-        max_len_b=100,
			
 
				+        max_len_a=1,
			
 
				+        max_len_b=200,
			
 
				     )
			
 
				 
			
 
				-    eval_configs = dict(
			
 
				-        output=f"MonotonicM4TS2STAgent_spa-eng_debug",
			
 
				-        quality_metrics="SEAMLESS_WHISPER_ASR_BLEU",
			
 
				-        latency_metrics="StartOffset EndOffset",
			
 
				-        whisper_model_size="large-v2",
			
 
				-        normalize_asr_bleu_references=True,
			
 
				+    if args.dtype == "fp16":
			
 
				+        model_configs.update(dict(fp16=True))
			
 
				+
			
 
				+    EVALUATION_SYSTEM_LIST.clear()
			
 
				+    if args.task == "s2st":
			
 
				+        model_configs.update(
			
 
				+            dict(
			
 
				+                min_unit_chunk_size=50,
			
 
				+            )
			
 
				+        )
			
 
				+        eval_configs = dict(
			
 
				+            quality_metrics="SEAMLESS_WHISPER_ASR_BLEU",
			
 
				+            latency_metrics="StartOffset EndOffset",
			
 
				+            whisper_model_size="large-v2",
			
 
				+            normalize_asr_bleu_references=True,
			
 
				+        )
			
 
				+        if args.expressive:
			
 
				+            EVALUATION_SYSTEM_LIST.append(SeamlessS2STAgent)
			
 
				+            model_configs.update(dict(vocoder_name="vocoder_pretssel"))
			
 
				+        else:
			
 
				+            EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2STAgent)
			
 
				+    elif args.task == "s2tt":
			
 
				+        EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2TAgent)
			
 
				+        parser.add_argument(
			
 
				+            "--unity-model-name",
			
 
				+            type=str,
			
 
				+            help="Unity model name.",
			
 
				+            default="seamless_streaming_unity",
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--tgt-lang",
			
 
				+            default="eng",
			
 
				+            type=str,
			
 
				+            help="Target language to translate/transcribe into.",
			
 
				+        )
			
 
				+        args, _ = parser.parse_known_args()
			
 
				+        asset_card = asset_store.retrieve_card(name=args.unity_model_name)
			
 
				+        tokenizer_uri = asset_card.field("tokenizer").as_uri()
			
 
				+        tokenizer_path = download_manager.download_tokenizer(
			
 
				+            tokenizer_uri, asset_card.name, force=False, progress=True
			
 
				+        )
			
 
				+        eval_configs = dict(
			
 
				+            sacrebleu_tokenizer=get_tokenizer(args.tgt_lang),
			
 
				+            eval_latency_unit="spm",
			
 
				+            eval_latency_spm_model=tokenizer_path,
			
 
				+            latency_metrics="AL LAAL",
			
 
				+        )
			
 
				+
			
 
				+    base_config = dict(
			
 
				+        dataloader="fairseq2_s2tt",
			
 
				+        dataloader_class="seamless_communication.streaming.dataloaders.s2tt.SimulEvalSpeechToTextDataloader",
			
 
				     )
			
 
				 
			
 
				-    evaluate(MonotonicM4TS2STAgent, {**data_configs, **model_configs, **eval_configs})
			
 
				+    system, args = build_system_args(
			
 
				+        {**base_config, **model_configs, **eval_configs}, parser
			
 
				+    )
			
 
				+
			
 
				+    evaluator = build_evaluator(args)
			
 
				+    evaluator(system)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/src/seamless_communication/cli/streaming/evaluate_pretssel_vocoder.py
+++ b/src/seamless_communication/cli/streaming/evaluate_pretssel_vocoder.py
@@ -1,49 +0,0 @@
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				-# All rights reserved.
			
 
				-#
			
 
				-# This source code is licensed under the license found in the
			
 
				-# LICENSE file in the root directory of this source tree.
			
 
				-
			
 
				-from seamless_communication.cli.streaming.scorers.seamless_whisper_asr_bleu import (
			
 
				-    SeamlessWhisperASRSacreBLEUScorer as SeamlessWhisperASRSacreBLEUScorer,
			
 
				-)
			
 
				-from seamless_communication.streaming.agents.mma_m4t_s2st import SeamlessS2STAgent
			
 
				-from simuleval.cli import evaluate
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    tgt_lang = "eng"
			
 
				-
			
 
				-    data_configs = dict(
			
 
				-        dataloader="fairseq2_s2tt",
			
 
				-        dataloader_class="seamless_communication.streaming.dataloaders.s2tt.SimulEvalSpeechToTextDataloader",
			
 
				-        data_file="/large_experiments/seamless/ust/annaysun/datasets/s2ut_pt/x2t_v2/dev_fleurs_spa-eng.tsv",
			
 
				-        tgt_lang=tgt_lang,
			
 
				-        audio_root_dir="/large_experiments/seamless/ust/data/audio_zips",
			
 
				-        end_index=10,
			
 
				-    )
			
 
				-
			
 
				-    model_configs = dict(
			
 
				-        vocoder_name="vocoder_pretssel_16khz",
			
 
				-        agent_class="seamless_communication.streaming.agents.mma_m4t_s2st.SeamlessS2STAgent",
			
 
				-        source_segment_size=320,
			
 
				-        task="s2st",
			
 
				-        device="cuda:0",
			
 
				-        dtype="fp16",
			
 
				-        min_starting_wait_w2vbert=192,
			
 
				-        decision_threshold=0.5,
			
 
				-        min_unit_chunk_size=50,
			
 
				-        no_early_stop=True,
			
 
				-        max_len_a=0,
			
 
				-        max_len_b=100,
			
 
				-    )
			
 
				-
			
 
				-    eval_configs = dict(
			
 
				-        output=f"SeamlessS2STAgent_spa-eng_debug",
			
 
				-        quality_metrics="SEAMLESS_WHISPER_ASR_BLEU",
			
 
				-        latency_metrics="StartOffset EndOffset",
			
 
				-        whisper_model_size="large-v2",
			
 
				-        normalize_asr_bleu_references=True,
			
 
				-    )
			
 
				-
			
 
				-    evaluate(SeamlessS2STAgent, {**data_configs, **model_configs, **eval_configs})
			
--- a/src/seamless_communication/cli/streaming/scorers/seamless_whisper_asr_bleu.py
+++ b/src/seamless_communication/cli/streaming/scorers/seamless_whisper_asr_bleu.py
@@ -9,7 +9,7 @@ from argparse import ArgumentParser, Namespace
 
				 from typing import Dict, List
			
 
				 
			
 
				 from sacrebleu.metrics.bleu import BLEU
			
 
				-from seamless_communication.cli.eval_utils import get_tokenizer, LANG3_LANG2
			
 
				+from seamless_communication.cli.eval_utils import LANG3_LANG2, get_tokenizer
			
 
				 from simuleval.evaluator.instance import LogInstance
			
 
				 from simuleval.evaluator.scorers.quality_scorer import (
			
 
				     WhisperASRSacreBLEUScorer,
			
--- a/src/seamless_communication/streaming/agents/online_text_decoder.py
+++ b/src/seamless_communication/streaming/agents/online_text_decoder.py
@@ -125,8 +125,8 @@ class OnlineTextDecoderAgent(GenericAgent):  # type: ignore
 
				         )
			
 
				         parser.add_argument(
			
 
				             "--tgt-lang",
			
 
				+            default="eng",
			
 
				             type=str,
			
 
				-            default=None,
			
 
				         )
			
 
				 
			
 
				     def policy(self, states: DecoderAgentStates) -> Action:
			
--- a/src/seamless_communication/streaming/agents/pretssel_vocoder.py
+++ b/src/seamless_communication/streaming/agents/pretssel_vocoder.py
@@ -6,20 +6,20 @@
 
				 from __future__ import annotations
			
 
				 
			
 
				 from argparse import ArgumentParser, Namespace
			
 
				-import torch
			
 
				-from typing import Any, Dict
			
 
				+from typing import Any, Dict, List
			
 
				 
			
 
				-from fairseq2.data.audio import WaveformToFbankConverter
			
 
				+import torch
			
 
				+from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput
			
 
				+from seamless_communication.models.generator.vocoder import PretsselVocoder
			
 
				 from seamless_communication.models.unity import load_gcmvn_stats
			
 
				 from seamless_communication.models.vocoder.vocoder import Vocoder
			
 
				-from seamless_communication.models.generator.vocoder import PretsselVocoder
			
 
				 from seamless_communication.streaming.agents.common import NoUpdateTargetMixin
			
 
				 from simuleval.agents import AgentStates, TextToSpeechAgent
			
 
				 from simuleval.agents.actions import ReadAction, WriteAction
			
 
				 from simuleval.data.segments import SpeechSegment
			
 
				 
			
 
				 
			
 
				-class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):
			
 
				+class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):  # type: ignore
			
 
				     def __init__(self, vocoder: Vocoder, args: Namespace) -> None:
			
 
				         super().__init__(args)
			
 
				         self.vocoder = vocoder
			
@@ -36,13 +36,15 @@ class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):
 
				             dtype=args.dtype,
			
 
				         )
			
 
				 
			
 
				-
			
 
				         _gcmvn_mean, _gcmvn_std = load_gcmvn_stats(args.vocoder_name)
			
 
				-        self.gcmvn_mean = torch.tensor(_gcmvn_mean, device=args.device, dtype=args.dtype)
			
 
				+        self.gcmvn_mean = torch.tensor(
			
 
				+            _gcmvn_mean, device=args.device, dtype=args.dtype
			
 
				+        )
			
 
				         self.gcmvn_std = torch.tensor(_gcmvn_std, device=args.device, dtype=args.dtype)
			
 
				 
			
 
				     def gcmvn_normalize(self, seqs: torch.Tensor) -> torch.Tensor:
			
 
				-        return seqs.subtract(self.gcmvn_mean).divide(self.gcmvn_std)
			
 
				+        result: torch.Tensor = seqs.subtract(self.gcmvn_mean).divide(self.gcmvn_std)
			
 
				+        return result
			
 
				 
			
 
				     @torch.inference_mode()
			
 
				     def policy(self, states: AgentStates) -> WriteAction:
			
@@ -66,15 +68,18 @@ class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):
 
				 
			
 
				         duration *= 2
			
 
				 
			
 
				-        if type(states.upstream_states[self.upstream_idx].source) == list:
			
 
				-            source = sum(states.upstream_states[self.upstream_idx].source, [])
			
 
				+        if isinstance(states.upstream_states[self.upstream_idx].source, list):
			
 
				+            source: List[float] = sum(
			
 
				+                states.upstream_states[self.upstream_idx].source, []
			
 
				+            )
			
 
				         else:
			
 
				             source = states.upstream_states[self.upstream_idx].source
			
 
				 
			
 
				-        audio_dict = {
			
 
				-            "waveform": torch.tensor(source, dtype=torch.float32, device=self.device).unsqueeze(1),
			
 
				+        audio_dict: WaveformToFbankInput = {
			
 
				+            "waveform": torch.tensor(
			
 
				+                source, dtype=torch.float32, device=self.device
			
 
				+            ).unsqueeze(1),
			
 
				             "sample_rate": self.sample_rate,
			
 
				-            "format": -1,
			
 
				         }
			
 
				 
			
 
				         feats = self.convert_to_fbank(audio_dict)["fbank"]
			
@@ -115,11 +120,13 @@ class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):
 
				             "--vocoder-sample-rate",
			
 
				             type=int,
			
 
				             default=16000,
			
 
				-            help="sample rate out of the vocoder"
			
 
				+            help="sample rate out of the vocoder",
			
 
				         )
			
 
				 
			
 
				     @classmethod
			
 
				-    def from_args(cls, args: Namespace, **kwargs: Dict[str, Any]) -> PretsselVocoderAgent:
			
 
				+    def from_args(
			
 
				+        cls, args: Namespace, **kwargs: Dict[str, Any]
			
 
				+    ) -> PretsselVocoderAgent:
			
 
				         vocoder = kwargs.get("vocoder", None)
			
 
				         assert isinstance(vocoder, PretsselVocoder)
			
 
				         return cls(vocoder, args)
			
--- a/src/seamless_communication/streaming/agents/unity_pipeline.py
+++ b/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -7,11 +7,13 @@ from __future__ import annotations
 
				 
			
 
				 import logging
			
 
				 from argparse import ArgumentParser, Namespace
			
 
				-from typing import Any, Dict, List, Optional
			
 
				+from typing import Any, Dict, List, Optional, Union
			
 
				 
			
 
				 import torch
			
 
				 from fairseq2.assets import asset_store
			
 
				 from seamless_communication.inference.translator import Modality, Translator
			
 
				+from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
			
 
				+from seamless_communication.models.generator.vocoder import PretsselVocoder
			
 
				 from seamless_communication.models.monotonic_decoder import (
			
 
				     load_monotonic_decoder_config,
			
 
				     load_monotonic_decoder_model,
			
@@ -23,7 +25,7 @@ from seamless_communication.models.unity import (
 
				     load_unity_unit_tokenizer,
			
 
				 )
			
 
				 from seamless_communication.models.vocoder.loader import load_vocoder_model
			
 
				-from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
			
 
				+from seamless_communication.models.vocoder.vocoder import Vocoder
			
 
				 from seamless_communication.streaming.agents.common import (
			
 
				     AgentStates,
			
 
				     EarlyStoppingMixin,
			
@@ -85,8 +87,13 @@ class UnitYPipelineMixin:
 
				         )
			
 
				         parser.add_argument(
			
 
				             "--dtype",
			
 
				+            choices=["fp16", "fp32"],
			
 
				             default="fp16",
			
 
				             type=str,
			
 
				+            help=(
			
 
				+                "Choose between half-precision (fp16) and single precision (fp32) floating point formats."
			
 
				+                + " Prefer this over the fp16 flag."
			
 
				+            ),
			
 
				         )
			
 
				 
			
 
				     @classmethod
			
@@ -140,8 +147,11 @@ class UnitYPipelineMixin:
 
				         )
			
 
				         monotonic_decoder_model.eval()
			
 
				 
			
 
				-        vocoder = None
			
 
				+        vocoder: Optional[Union[PretsselVocoder, Vocoder]] = None
			
 
				         if args.vocoder_name is not None and output_modality == Modality.SPEECH:
			
 
				+            logger.info(
			
 
				+                f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
			
 
				+            )
			
 
				             if "pretssel" in args.vocoder_name:
			
 
				                 vocoder = load_pretssel_vocoder_model(
			
 
				                     args.vocoder_name, device=args.device, dtype=args.dtype
			
@@ -150,7 +160,7 @@ class UnitYPipelineMixin:
 
				                 vocoder = load_vocoder_model(
			
 
				                     args.vocoder_name, device=args.device, dtype=args.dtype
			
 
				                 )
			
 
				-
			
 
				+            assert vocoder is not None
			
 
				             vocoder.eval()
			
 
				 
			
 
				         return {
			
--- a/src/seamless_communication/streaming/dataloaders/s2tt.py
+++ b/src/seamless_communication/streaming/dataloaders/s2tt.py
@@ -210,7 +210,16 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
				             help="Source segment size, For text the unit is # token, for speech is ms",
			
 
				         )
			
 
				         parser.add_argument(
			
 
				-            "--tgt-lang", type=str, help="Target language to translate/transcribe into."
			
 
				+            "--tgt-lang",
			
 
				+            default="eng",
			
 
				+            type=str,
			
 
				+            help="Target language to translate/transcribe into.",
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--output",
			
 
				+            type=str,
			
 
				+            required=True,
			
 
				+            help="Output directory. Required if using iterable dataloader.",
			
 
				         )
			
 
				         parser.add_argument(
			
 
				             "--strip-silence",