1 年間前 · 732d7bd5a5
--- a/src/seamless_communication/cli/eval_utils/compute_metrics.py
+++ b/src/seamless_communication/cli/eval_utils/compute_metrics.py
@@ -4,23 +4,23 @@
 
															 # This source code is licensed under the license found in the
														
 
															 # LICENSE file in the root directory of this source tree.
														
 
															+import json
														
 
															 import logging
														
 
															+from pathlib import Path
														
 
															+from typing import Optional, Tuple, Union
														
 
															+
														
 
															 import pandas as pd
														
 
															 import whisper
														
 
															-
														
 
															 from fairseq2.typing import Device
														
 
															 from jiwer import cer, wer
														
 
															-from pathlib import Path
														
 
															 from sacrebleu.metrics.base import Score, Signature
														
 
															 from sacrebleu.metrics.bleu import BLEU
														
 
															 from sacrebleu.metrics.chrf import CHRF
														
 
															 from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2
														
 
															 from tqdm import tqdm
														
 
															-from typing import Optional, Tuple, Union
														
 
															 from whisper import Whisper
														
 
															 from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer
														
 
															-
														
 
															 logging.basicConfig(
														
 
															     level=logging.INFO,
														
 
															     format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
														
@@ -190,7 +190,7 @@ def compute_asr_error_rate(
 
															     ref_text_series: pd.Series,
														
 
															     lang: str,
														
 
															     whisper_normalize_text: bool = True,
														
 
															-) -> Tuple[Score, str]:
														
 
															+) -> Tuple[float, str]:
														
 
															     """Wraps normalization functions and computes ASR WER/CER score
														
 
															     Args:
														
 
															         hyp_text_series (pd.Series): each line contains s2t model prediction or first pass prediction
														
@@ -348,17 +348,24 @@ def compute_quality_metrics(
 
															         logger.info(f"{task} ASR Normalized BLEU:\n{asr_bleu_normalized_json}")
														
 
															     if task == "ASR":
														
 
															-        _, asr_error_rate_signature = compute_asr_error_rate(
														
 
															+        asr_error_rate, asr_error_rate_signature = compute_asr_error_rate(
														
 
															             hyp_text_series=df[pred_text_col_name],
														
 
															             ref_text_series=df[ref_text_col_name],
														
 
															             lang=tgt_lang,
														
 
															             whisper_normalize_text=whisper_normalize_text_output,
														
 
															         )
														
 
															+        d = {
														
 
															+            "name": "WER",
														
 
															+            "score": asr_error_rate,
														
 
															+            "signature": asr_error_rate_signature,
														
 
															+        }
														
 
															+        asr_error_rate_json = json.dumps(d, indent=1, ensure_ascii=False)
														
 
															+
														
 
															         filename = "asr_error_rate.json"
														
 
															         with open(output_path / filename, "w") as f:
														
 
															-            f.write(asr_error_rate_signature)
														
 
															+            f.write(asr_error_rate_json)
														
 
															-        logger.info(f"ASR : {asr_error_rate_signature}")
														
 
															+        logger.info(f"ASR : {asr_error_rate_json}")
														
 
															     return filename
														
--- a/src/seamless_communication/cli/streaming/README.md
+++ b/src/seamless_communication/cli/streaming/README.md
@@ -0,0 +1,45 @@
 
															+# Evaluating SeamlessStreaming and Seamless models
														
 
															+SeamlessStreaming is the streaming-only model, and Seamless is the expressive streaming model.
														
 
															+
														
 
															+## Quick start:
														
 
															+
														
 
															+Evaluation can be run with the `streaming_evaluate` CLI.
														
 
															+
														
 
															+We use `seamless_streaming_unity` for loading the speech encoder and T2U models, and `seamless_streaming_monotonic_decoder` for loading the text decoder for streaming evaluation. These are already set as the defaults for the `streaming_evaluate` CLI, but can be overridden using the `--unity-model-name` and `--monotonic-decoder-model-name` args if required.
														
 
															+
														
 
															+Note that the numbers in the paper use single precision floating point format (fp32) for evaluation by setting `--dtype fp32`.
														
 
															+
														
 
															+### S2TT:
														
 
															+Set the task to `s2tt` for evaluating the speech-to-text translation part of the SeamlessStreaming model.
														
 
															+
														
 
															+```bash
														
 
															+streaming_evaluate --task s2tt --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_lang_code>
														
 
															+```
														
 
															+
														
 
															+Note: The `--ref-field` can be used to specify the name of the reference column in the dataset.
														
 
															+
														
 
															+### ASR:
														
 
															+Set the task to `asr` for evaluating the automatic speech recognition part of the SeamlessStreaming model. Make sure to pass the source language as the `--tgt-lang` arg.
														
 
															+
														
 
															+```bash
														
 
															+streaming_evaluate --task asr --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_source_lang_code>
														
 
															+```
														
 
															+
														
 
															+### S2ST:
														
 
															+
														
 
															+#### SeamlessStreaming:
														
 
															+
														
 
															+Set the task to `s2st` for evaluating the speech-to-speech translation part of the SeamlessStreaming model. 
														
 
															+
														
 
															+```bash
														
 
															+streaming_evaluate --task s2st --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_lang_code>
														
 
															+```
														
 
															+
														
 
															+#### Seamless:
														
 
															+The Seamless model is a unified model for streaming expressive speech-to-speech translation. Use the `--expressive` arg for running evaluation of this unified model.
														
 
															+
														
 
															+```bash
														
 
															+streaming_evaluate --task s2st --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_lang_code> --expressive
														
 
															+```
														
 
															+
														
 
															+Note: In the current version of our paper, we use `vocoder_pretssel_16khz` for the evaluation, so in order to reproduce those results please add this arg to the above command: `--vocoder-name vocoder_pretssel_16khz`
														
--- a/src/seamless_communication/cli/streaming/evaluate.py
+++ b/src/seamless_communication/cli/streaming/evaluate.py
@@ -5,19 +5,29 @@
 
															 # LICENSE file in the root directory of this source tree.
														
 
															 import argparse
														
 
															+import logging
														
 
															 from fairseq2.assets import asset_store, download_manager
														
 
															-from seamless_communication.streaming.agents.mma_m4t_s2st import (
														
 
															-    MonotonicM4TS2STAgent,
														
 
															-    SeamlessS2STAgent,
														
 
															-)
														
 
															 from seamless_communication.cli.streaming.scorers.seamless_quality_scorer import (
														
 
															     SeamlessQualityScorer,
														
 
															 )
														
 
															-from seamless_communication.streaming.agents.mma_m4t_s2t import MonotonicM4TS2TAgent
														
 
															+from seamless_communication.streaming.agents.seamless_s2st import SeamlessS2STAgent
														
 
															+from seamless_communication.streaming.agents.seamless_streaming_s2st import (
														
 
															+    SeamlessStreamingS2STAgent,
														
 
															+)
														
 
															+from seamless_communication.streaming.agents.seamless_streaming_s2t import (
														
 
															+    SeamlessStreamingS2TAgent,
														
 
															+)
														
 
															 from simuleval.evaluator import build_evaluator
														
 
															 from simuleval.utils.agent import EVALUATION_SYSTEM_LIST, build_system_args
														
 
															+logging.basicConfig(
														
 
															+    level=logging.INFO,
														
 
															+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
														
 
															+)
														
 
															+
														
 
															+logger = logging.getLogger(__name__)
														
 
															+
														
 
															 def main() -> None:
														
 
															     parser = argparse.ArgumentParser(
														
@@ -28,7 +38,7 @@ def main() -> None:
 
															     parser.add_argument(
														
 
															         "--task",
														
 
															-        choices=["s2st", "s2tt"],
														
 
															+        choices=["s2st", "s2tt", "asr"],
														
 
															         required=True,
														
 
															         type=str,
														
 
															         help="Target language to translate/transcribe into.",
														
@@ -39,46 +49,33 @@ def main() -> None:
 
															         default=False,
														
 
															         help="Expressive streaming S2ST inference",
														
 
															     )
														
 
															-    parser.add_argument(
														
 
															-        "--dtype",
														
 
															-        default="fp16",
														
 
															-        type=str,
														
 
															-    )
														
 
															     args, _ = parser.parse_known_args()
														
 
															     model_configs = dict(
														
 
															         source_segment_size=320,
														
 
															         device="cuda:0",
														
 
															-        dtype=args.dtype,
														
 
															+        dtype="fp16",
														
 
															         min_starting_wait_w2vbert=192,
														
 
															         decision_threshold=0.5,
														
 
															         no_early_stop=True,
														
 
															-        max_len_a=1,
														
 
															-        max_len_b=200,
														
 
															+        max_len_a=0,
														
 
															+        max_len_b=100,
														
 
															     )
														
 
															-    if args.dtype == "fp16":
														
 
															-        model_configs.update(dict(fp16=True))
														
 
															-
														
 
															     EVALUATION_SYSTEM_LIST.clear()
														
 
															     eval_configs = dict(quality_metrics="SEAMLESS_QUALITY_SCORER")
														
 
															     if args.task == "s2st":
														
 
															-        model_configs.update(
														
 
															-            dict(
														
 
															-                min_unit_chunk_size=50,
														
 
															-            )
														
 
															-        )
														
 
															+        model_configs["min_unit_chunk_size"] = 50
														
 
															         eval_configs["latency_metrics"] = "StartOffset EndOffset"
														
 
															         if args.expressive:
														
 
															             EVALUATION_SYSTEM_LIST.append(SeamlessS2STAgent)
														
 
															-            model_configs.update(dict(vocoder_name="vocoder_pretssel"))
														
 
															         else:
														
 
															-            EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2STAgent)
														
 
															-    elif args.task == "s2tt":
														
 
															+            EVALUATION_SYSTEM_LIST.append(SeamlessStreamingS2STAgent)
														
 
															+    elif args.task in ["s2tt", "asr"]:
														
 
															         assert args.expressive is False, "S2TT inference cannot be expressive."
														
 
															-        EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2TAgent)
														
 
															+        EVALUATION_SYSTEM_LIST.append(SeamlessStreamingS2TAgent)
														
 
															         parser.add_argument(
														
 
															             "--unity-model-name",
														
 
															             type=str,
														
@@ -104,6 +101,9 @@ def main() -> None:
 
															         {**base_config, **model_configs, **eval_configs}, parser
														
 
															     )
														
 
															+    if args.fp16:
														
 
															+        logger.warn("--fp16 arg will be ignorned, use --dtype instead")
														
 
															+
														
 
															     evaluator = build_evaluator(args)
														
 
															     evaluator(system)
														
--- a/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py
+++ b/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py
@@ -5,23 +5,19 @@
 
															 # LICENSE file in the root directory of this source tree.
														
 
															 from __future__ import annotations
														
 
															-import pandas
														
 
															-from fairseq2.typing import Device
														
 
															-from pathlib import Path
														
 
															-from typing import Optional
														
 
															+
														
 
															 import json
														
 
															 from argparse import ArgumentParser, Namespace
														
 
															-from typing import Dict
														
 
															+from pathlib import Path
														
 
															+from typing import Dict, Optional
														
 
															+import pandas
														
 
															+from fairseq2.typing import Device
														
 
															+from seamless_communication.cli.eval_utils import compute_quality_metrics
														
 
															+from simuleval.evaluator.instance import LogInstance
														
 
															 from simuleval.evaluator.scorers.quality_scorer import (
														
 
															-    register_quality_scorer,
														
 
															     QualityScorer,
														
 
															-)
														
 
															-
														
 
															-from simuleval.evaluator.instance import LogInstance
														
 
															-
														
 
															-from seamless_communication.cli.eval_utils import (
														
 
															-    compute_quality_metrics,
														
 
															+    register_quality_scorer,
														
 
															 )
														
@@ -90,19 +86,13 @@ class SeamlessQualityScorer(QualityScorer):  # type: ignore
 
															     @staticmethod
														
 
															     def add_args(parser: ArgumentParser) -> None:
														
 
															-        try:
														
 
															-            parser.add_argument(
														
 
															-                "--task", type=str, help="Task to evaluate", required=True
														
 
															-            )
														
 
															-            parser.add_argument(
														
 
															-                "--tgt-lang",
														
 
															-                type=str,
														
 
															-                help="Target language to translate/transcribe into.",
														
 
															-                required=True,
														
 
															-            )
														
 
															-        except:
														
 
															-            pass
														
 
															-
														
 
															+        parser.add_argument("--task", type=str, help="Task to evaluate", required=True)
														
 
															+        parser.add_argument(
														
 
															+            "--tgt-lang",
														
 
															+            type=str,
														
 
															+            help="Target language to translate/transcribe into.",
														
 
															+            required=True,
														
 
															+        )
														
 
															         parser.add_argument(
														
 
															             "--whisper-model-name", type=str, help="Whisper model name", default="large"
														
 
															         )
														
--- a/src/seamless_communication/streaming/agents/online_vocoder.py
+++ b/src/seamless_communication/streaming/agents/online_vocoder.py
@@ -5,21 +5,37 @@
 
															 # LICENSE file in the root directory of this source tree.
														
 
															 from __future__ import annotations
														
 
															+import logging
														
 
															 from argparse import ArgumentParser, Namespace
														
 
															 from typing import Any, Dict
														
 
															-import torch
														
 
															-from seamless_communication.models.vocoder.vocoder import Vocoder
														
 
															+import torch
														
 
															+from seamless_communication.models.vocoder.loader import load_vocoder_model
														
 
															 from simuleval.agents import AgentStates, TextToSpeechAgent
														
 
															 from simuleval.agents.actions import ReadAction, WriteAction
														
 
															 from simuleval.data.segments import SpeechSegment
														
 
															+logging.basicConfig(
														
 
															+    level=logging.INFO,
														
 
															+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
														
 
															+)
														
 
															+
														
 
															+logger = logging.getLogger(__name__)
														
 
															+
														
 
															 class VocoderAgent(TextToSpeechAgent):  # type: ignore
														
 
															-    def __init__(self, vocoder: Vocoder, args: Namespace) -> None:
														
 
															+    def __init__(self, args: Namespace) -> None:
														
 
															         super().__init__(args)
														
 
															+
														
 
															+        logger.info(
														
 
															+            f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
														
 
															+        )
														
 
															+        self.vocoder = load_vocoder_model(
														
 
															+            args.vocoder_name, device=args.device, dtype=args.dtype
														
 
															+        )
														
 
															+        self.vocoder.eval()
														
 
															+
														
 
															         self.sample_rate = args.sample_rate
														
 
															-        self.vocoder = vocoder
														
 
															         self.tgt_lang = args.tgt_lang
														
 
															         self.speaker_id = args.vocoder_speaker_id
														
@@ -54,6 +70,12 @@ class VocoderAgent(TextToSpeechAgent):  # type: ignore
 
															     @classmethod
														
 
															     def add_args(cls, parser: ArgumentParser) -> None:
														
 
															+        parser.add_argument(
														
 
															+            "--vocoder-name",
														
 
															+            type=str,
														
 
															+            help="Vocoder name.",
														
 
															+            default="vocoder_v2",
														
 
															+        )
														
 
															         parser.add_argument(
														
 
															             "--vocoder-speaker-id",
														
 
															             type=int,
														
@@ -64,6 +86,4 @@ class VocoderAgent(TextToSpeechAgent):  # type: ignore
 
															     @classmethod
														
 
															     def from_args(cls, args: Namespace, **kwargs: Dict[str, Any]) -> VocoderAgent:
														
 
															-        vocoder = kwargs.get("vocoder", None)
														
 
															-        assert isinstance(vocoder, Vocoder)
														
 
															-        return cls(vocoder, args)
														
 
															+        return cls(args)
														
--- a/src/seamless_communication/streaming/agents/pretssel_vocoder.py
+++ b/src/seamless_communication/streaming/agents/pretssel_vocoder.py
@@ -5,27 +5,46 @@
 
															 # LICENSE file in the root directory of this source tree.
														
 
															 from __future__ import annotations
														
 
															+import logging
														
 
															 from argparse import ArgumentParser, Namespace
														
 
															 from typing import Any, Dict, List
														
 
															 import torch
														
 
															+from fairseq2.assets import asset_store
														
 
															 from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput
														
 
															-from seamless_communication.models.generator.vocoder import PretsselVocoder
														
 
															+from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
														
 
															 from seamless_communication.models.unity import load_gcmvn_stats
														
 
															-from seamless_communication.models.vocoder.vocoder import Vocoder
														
 
															 from seamless_communication.streaming.agents.common import NoUpdateTargetMixin
														
 
															 from simuleval.agents import AgentStates, TextToSpeechAgent
														
 
															 from simuleval.agents.actions import ReadAction, WriteAction
														
 
															 from simuleval.data.segments import SpeechSegment
														
 
															+logging.basicConfig(
														
 
															+    level=logging.INFO,
														
 
															+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
														
 
															+)
														
 
															+
														
 
															+logger = logging.getLogger(__name__)
														
 
															+
														
 
															 class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):  # type: ignore
														
 
															-    def __init__(self, vocoder: Vocoder, args: Namespace) -> None:
														
 
															+    def __init__(self, args: Namespace) -> None:
														
 
															         super().__init__(args)
														
 
															-        self.vocoder = vocoder
														
 
															+
														
 
															+        logger.info(
														
 
															+            f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
														
 
															+        )
														
 
															+        assert "pretssel" in args.vocoder_name
														
 
															+        self.vocoder = load_pretssel_vocoder_model(
														
 
															+            args.vocoder_name, device=args.device, dtype=args.dtype
														
 
															+        )
														
 
															+        self.vocoder.eval()
														
 
															+
														
 
															+        vocoder_model_card = asset_store.retrieve_card(args.vocoder_name)
														
 
															+        self.vocoder_sample_rate = vocoder_model_card.field("sample_rate").as_(int)
														
 
															+
														
 
															         self.upstream_idx = args.upstream_idx
														
 
															         self.sample_rate = args.sample_rate  # input sample rate
														
 
															-        self.vocoder_sample_rate = args.vocoder_sample_rate  # output sample rate
														
 
															         self.tgt_lang = args.tgt_lang
														
 
															         self.convert_to_fbank = WaveformToFbankConverter(
														
 
															             num_mel_bins=80,
														
@@ -110,23 +129,21 @@ class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):  # type: ign
 
															     @classmethod
														
 
															     def add_args(cls, parser: ArgumentParser) -> None:
														
 
															+        parser.add_argument(
														
 
															+            "--vocoder-name",
														
 
															+            type=str,
														
 
															+            help="Vocoder name.",
														
 
															+            default="vocoder_pretssel",
														
 
															+        )
														
 
															         parser.add_argument(
														
 
															             "--upstream-idx",
														
 
															             type=int,
														
 
															             default=0,
														
 
															             help="index of encoder states where states.source contains input audio",
														
 
															         )
														
 
															-        parser.add_argument(
														
 
															-            "--vocoder-sample-rate",
														
 
															-            type=int,
														
 
															-            default=16000,
														
 
															-            help="sample rate out of the vocoder",
														
 
															-        )
														
 
															     @classmethod
														
 
															     def from_args(
														
 
															         cls, args: Namespace, **kwargs: Dict[str, Any]
														
 
															     ) -> PretsselVocoderAgent:
														
 
															-        vocoder = kwargs.get("vocoder", None)
														
 
															-        assert isinstance(vocoder, PretsselVocoder)
														
 
															-        return cls(vocoder, args)
														
 
															+        return cls(args)
														
--- a/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py
+++ b/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py
@@ -4,6 +4,7 @@
 
															 # This source code is licensed under the license found in the
														
 
															 # LICENSE file in the root directory of this source tree.
														
 
															+from seamless_communication.streaming.agents.detokenizer import UnitYDetokenizerAgent
														
 
															 from seamless_communication.streaming.agents.offline_w2v_bert_encoder import (
														
 
															     OfflineWav2VecBertEncoderAgent,
														
 
															 )
														
@@ -16,19 +17,15 @@ from seamless_communication.streaming.agents.online_text_decoder import (
 
															 from seamless_communication.streaming.agents.online_unit_decoder import (
														
 
															     NARUnitYUnitDecoderAgent,
														
 
															 )
														
 
															-from seamless_communication.streaming.agents.silero_vad import SileroVADAgent
														
 
															 from seamless_communication.streaming.agents.online_vocoder import VocoderAgent
														
 
															-from seamless_communication.streaming.agents.pretssel_vocoder import PretsselVocoderAgent
														
 
															-
														
 
															-from seamless_communication.streaming.agents.detokenizer import UnitYDetokenizerAgent
														
 
															+from seamless_communication.streaming.agents.silero_vad import SileroVADAgent
														
 
															 from seamless_communication.streaming.agents.unity_pipeline import (
														
 
															     UnitYAgentPipeline,
														
 
															     UnitYAgentTreePipeline,
														
 
															 )
														
 
															-from simuleval.utils import entrypoint
														
 
															-class MonotonicM4TS2STAgent(UnitYAgentPipeline):
														
 
															+class SeamlessStreamingS2STAgent(UnitYAgentPipeline):
														
 
															     pipeline = [
														
 
															         OnlineFeatureExtractorAgent,
														
 
															         OfflineWav2VecBertEncoderAgent,
														
@@ -38,17 +35,7 @@ class MonotonicM4TS2STAgent(UnitYAgentPipeline):
 
															     ]
														
 
															-class SeamlessS2STAgent(UnitYAgentPipeline):
														
 
															-    pipeline = [
														
 
															-        OnlineFeatureExtractorAgent,
														
 
															-        OfflineWav2VecBertEncoderAgent,
														
 
															-        UnitYMMATextDecoderAgent,
														
 
															-        NARUnitYUnitDecoderAgent,
														
 
															-        PretsselVocoderAgent,
														
 
															-    ]
														
 
															-
														
 
															-
														
 
															-class MonotonicM4TS2STVADAgent(UnitYAgentPipeline):
														
 
															+class SeamlessStreamingS2STVADAgent(UnitYAgentPipeline):
														
 
															     pipeline = [
														
 
															         SileroVADAgent,
														
 
															         OnlineFeatureExtractorAgent,
														
@@ -59,7 +46,7 @@ class MonotonicM4TS2STVADAgent(UnitYAgentPipeline):
 
															     ]
														
 
															-class MonotonicM4TS2STJointVADAgent(UnitYAgentTreePipeline):
														
 
															+class SeamlessStreamingS2STJointVADAgent(UnitYAgentTreePipeline):
														
 
															     pipeline = {
														
 
															         SileroVADAgent: [OnlineFeatureExtractorAgent],
														
 
															         OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent],
														
@@ -69,15 +56,3 @@ class MonotonicM4TS2STJointVADAgent(UnitYAgentTreePipeline):
 
															         NARUnitYUnitDecoderAgent: [VocoderAgent],
														
 
															         VocoderAgent: [],
														
 
															     }
														
 
															-
														
 
															-
														
 
															-class SeamlessS2STJointVADAgent(UnitYAgentTreePipeline):
														
 
															-    pipeline = {
														
 
															-        SileroVADAgent: [OnlineFeatureExtractorAgent],
														
 
															-        OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent],
														
 
															-        OfflineWav2VecBertEncoderAgent: [UnitYMMATextDecoderAgent],
														
 
															-        UnitYMMATextDecoderAgent: [UnitYDetokenizerAgent, NARUnitYUnitDecoderAgent],
														
 
															-        UnitYDetokenizerAgent: [],
														
 
															-        NARUnitYUnitDecoderAgent: [PretsselVocoderAgent],
														
 
															-        PretsselVocoderAgent: [],
														
 
															-    }
														
--- a/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py
+++ b/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py
@@ -16,11 +16,9 @@ from seamless_communication.streaming.agents.online_text_decoder import (
 
															 )
														
 
															 from seamless_communication.streaming.agents.silero_vad import SileroVADAgent
														
 
															 from seamless_communication.streaming.agents.unity_pipeline import UnitYAgentPipeline
														
 
															-from simuleval.utils import entrypoint
														
 
															-@entrypoint
														
 
															-class MonotonicM4TS2TDetokAgent(UnitYAgentPipeline):
														
 
															+class SeamlessStreamingS2TDetokAgent(UnitYAgentPipeline):
														
 
															     pipeline = [
														
 
															         OnlineFeatureExtractorAgent,
														
 
															         OfflineWav2VecBertEncoderAgent,
														
@@ -29,8 +27,7 @@ class MonotonicM4TS2TDetokAgent(UnitYAgentPipeline):
 
															     ]
														
 
															-@entrypoint
														
 
															-class MonotonicM4TS2TAgent(UnitYAgentPipeline):
														
 
															+class SeamlessStreamingS2TAgent(UnitYAgentPipeline):
														
 
															     pipeline = [
														
 
															         OnlineFeatureExtractorAgent,
														
 
															         OfflineWav2VecBertEncoderAgent,
														
@@ -38,7 +35,7 @@ class MonotonicM4TS2TAgent(UnitYAgentPipeline):
 
															     ]
														
 
															-class MonotonicM4TS2TVADAgent(UnitYAgentPipeline):
														
 
															+class SeamlessStreamingS2TVADAgent(UnitYAgentPipeline):
														
 
															     pipeline = [
														
 
															         SileroVADAgent,
														
 
															         OnlineFeatureExtractorAgent,
														
--- a/src/seamless_communication/streaming/agents/unity_pipeline.py
+++ b/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -74,12 +74,7 @@ class UnitYPipelineMixin:
 
															             help="Monotonic decoder model name.",
														
 
															             default="seamless_streaming_monotonic_decoder",
														
 
															         )
														
 
															-        parser.add_argument(
														
 
															-            "--vocoder-name",
														
 
															-            type=str,
														
 
															-            help="Vocoder name.",
														
 
															-            default="vocoder_v2",
														
 
															-        )
														
 
															+
														
 
															         parser.add_argument(
														
 
															             "--sample-rate",
														
 
															             default=16000,
														
@@ -147,22 +142,6 @@ class UnitYPipelineMixin:
 
															         )
														
 
															         monotonic_decoder_model.eval()
														
 
															-        vocoder: Optional[Union[PretsselVocoder, Vocoder]] = None
														
 
															-        if args.vocoder_name is not None and output_modality == Modality.SPEECH:
														
 
															-            logger.info(
														
 
															-                f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
														
 
															-            )
														
 
															-            if "pretssel" in args.vocoder_name:
														
 
															-                vocoder = load_pretssel_vocoder_model(
														
 
															-                    args.vocoder_name, device=args.device, dtype=args.dtype
														
 
															-                )
														
 
															-            else:
														
 
															-                vocoder = load_vocoder_model(
														
 
															-                    args.vocoder_name, device=args.device, dtype=args.dtype
														
 
															-                )
														
 
															-            assert vocoder is not None
														
 
															-            vocoder.eval()
														
 
															-
														
 
															         return {
														
 
															             "unity_model": unity_model,
														
 
															             "unity_config": unity_config,
														
@@ -170,7 +149,6 @@ class UnitYPipelineMixin:
 
															             "monotonic_decoder_config": monotonic_decoder_config,
														
 
															             "text_tokenizer": text_tokenizer,
														
 
															             "unit_tokenizer": unit_tokenizer,
														
 
															-            "vocoder": vocoder,
														
 
															         }
														
--- a/src/seamless_communication/streaming/dataloaders/s2tt.py
+++ b/src/seamless_communication/streaming/dataloaders/s2tt.py
@@ -55,7 +55,13 @@ class SileroVADSilenceRemover:
 
															             onnx=False,
														
 
															         )
														
 
															-    def __call__(self, sample_list: List[float]) -> List[float]:
														
 
															+    def __call__(self, sample: torch.Tensor, is_standardized: bool) -> List[float]:
														
 
															+        if not is_standardized:
														
 
															+            # Standardizing here just for getting silence boundaries
														
 
															+            standarized_sample_list = F.layer_norm(sample, sample.shape).tolist()
														
 
															+        else:
														
 
															+            standarized_sample_list = sample.tolist()
														
 
															+
														
 
															         (
														
 
															             get_speech_timestamps,
														
 
															             save_audio,
														
@@ -64,8 +70,10 @@ class SileroVADSilenceRemover:
 
															             collect_chunks,
														
 
															         ) = self.utils
														
 
															         speech_timestamps = get_speech_timestamps(
														
 
															-            sample_list, self.model, sampling_rate=self.sample_rate
														
 
															+            standarized_sample_list, self.model, sampling_rate=self.sample_rate
														
 
															         )
														
 
															+
														
 
															+        sample_list: List[float] = sample.tolist()
														
 
															         if len(speech_timestamps) == 0:
														
 
															             return sample_list
														
 
															         speech_start_time = speech_timestamps[0]["start"]
														
@@ -75,7 +83,9 @@ class SileroVADSilenceRemover:
 
															 @register_dataloader("fairseq2_s2tt")
														
 
															 class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader):  # type: ignore
														
 
															-    def __init__(self, data_pipeline: DataPipeline, args: Namespace) -> None:
														
 
															+    def __init__(
														
 
															+        self, data_pipeline: DataPipeline, is_standardized: bool, args: Namespace
														
 
															+    ) -> None:
														
 
															         self.args = args
														
 
															         self.data_file: Path = Path(getattr(self.args, "data_file", ""))
														
 
															         if not self.data_file.exists():
														
@@ -83,10 +93,12 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
															         self.start_index: int = getattr(self.args, "start_index", 0)
														
 
															         self.end_index: int = getattr(self.args, "end_index", -1)
														
 
															         self.data_pipeline = data_pipeline
														
 
															+        self.is_standardized = is_standardized
														
 
															         self.data_itr = iter(self.data_pipeline)
														
 
															         self.cur_index = self.start_index - 1
														
 
															+        self.no_strip_silence = self.args.no_strip_silence
														
 
															         self.silence_remover = None
														
 
															-        if self.args.strip_silence:
														
 
															+        if not self.no_strip_silence:
														
 
															             logger.warn(
														
 
															                 "Stripping silence in the beginning and the end of audio with SileroVAD."
														
 
															             )
														
@@ -113,12 +125,12 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
															         return self.end_index - self.start_index
														
 
															     def get_source(self, index: Optional[int] = None) -> List[float]:
														
 
															-        source: List[float] = (
														
 
															-            self.item["audio"]["data"]["waveform"]["seqs"].squeeze().tolist()
														
 
															-        )
														
 
															+        squeezed_item = self.item["audio"]["data"]["waveform"]["seqs"].squeeze()
														
 
															-        if self.silence_remover is not None:
														
 
															-            source = self.silence_remover(source)
														
 
															+        if not self.no_strip_silence and self.silence_remover is not None:
														
 
															+            source = self.silence_remover(squeezed_item, self.is_standardized)
														
 
															+        else:
														
 
															+            source = squeezed_item.tolist()
														
 
															         return source
														
@@ -168,10 +180,13 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
															             selector="audio.data",
														
 
															         )
														
 
															-        pipeline_builder.map(
														
 
															-            lambda x: F.layer_norm(x, x.shape),
														
 
															-            selector="audio.data.waveform",
														
 
															-        )
														
 
															+        is_standardized = False
														
 
															+        if args.standardize_audio:
														
 
															+            pipeline_builder.map(
														
 
															+                lambda x: F.layer_norm(x, x.shape),
														
 
															+                selector="audio.data.waveform",
														
 
															+            )
														
 
															+            is_standardized = True
														
 
															         collate = Collater(pad_value=0, pad_to_multiple=1)
														
@@ -181,7 +196,7 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
															         data_pipeline = pipeline_builder.and_return()
														
 
															-        return cls(data_pipeline, args)
														
 
															+        return cls(data_pipeline, is_standardized, args)
														
 
															     @staticmethod
														
 
															     def add_args(parser: ArgumentParser) -> None:
														
@@ -222,8 +237,13 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
															             help="Output directory. Required if using iterable dataloader.",
														
 
															         )
														
 
															         parser.add_argument(
														
 
															-            "--strip-silence",
														
 
															+            "--no-strip-silence",
														
 
															             action="store_true",
														
 
															             default=False,
														
 
															             help="Strip silence in the beginning and the end of audio.",
														
 
															         )
														
 
															+        parser.add_argument(
														
 
															+            "--standardize-audio",
														
 
															+            action="store_true",
														
 
															+            help="Standardize audio.",
														
 
															+        )