1 năm trước cách đây · 732d7bd5a5
--- a/src/seamless_communication/cli/eval_utils/compute_metrics.py
+++ b/src/seamless_communication/cli/eval_utils/compute_metrics.py
@@ -4,23 +4,23 @@
 
				 # This source code is licensed under the license found in the
			
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 
			
 
				+import json
			
 
				 import logging
			
 
				+from pathlib import Path
			
 
				+from typing import Optional, Tuple, Union
			
 
				+
			
 
				 import pandas as pd
			
 
				 import whisper
			
 
				-
			
 
				 from fairseq2.typing import Device
			
 
				 from jiwer import cer, wer
			
 
				-from pathlib import Path
			
 
				 from sacrebleu.metrics.base import Score, Signature
			
 
				 from sacrebleu.metrics.bleu import BLEU
			
 
				 from sacrebleu.metrics.chrf import CHRF
			
 
				 from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2
			
 
				 from tqdm import tqdm
			
 
				-from typing import Optional, Tuple, Union
			
 
				 from whisper import Whisper
			
 
				 from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer
			
 
				 
			
 
				-
			
 
				 logging.basicConfig(
			
 
				     level=logging.INFO,
			
 
				     format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
			
@@ -190,7 +190,7 @@ def compute_asr_error_rate(
 
				     ref_text_series: pd.Series,
			
 
				     lang: str,
			
 
				     whisper_normalize_text: bool = True,
			
 
				-) -> Tuple[Score, str]:
			
 
				+) -> Tuple[float, str]:
			
 
				     """Wraps normalization functions and computes ASR WER/CER score
			
 
				     Args:
			
 
				         hyp_text_series (pd.Series): each line contains s2t model prediction or first pass prediction
			
@@ -348,17 +348,24 @@ def compute_quality_metrics(
 
				         logger.info(f"{task} ASR Normalized BLEU:\n{asr_bleu_normalized_json}")
			
 
				 
			
 
				     if task == "ASR":
			
 
				-        _, asr_error_rate_signature = compute_asr_error_rate(
			
 
				+        asr_error_rate, asr_error_rate_signature = compute_asr_error_rate(
			
 
				             hyp_text_series=df[pred_text_col_name],
			
 
				             ref_text_series=df[ref_text_col_name],
			
 
				             lang=tgt_lang,
			
 
				             whisper_normalize_text=whisper_normalize_text_output,
			
 
				         )
			
 
				+        d = {
			
 
				+            "name": "WER",
			
 
				+            "score": asr_error_rate,
			
 
				+            "signature": asr_error_rate_signature,
			
 
				+        }
			
 
				+        asr_error_rate_json = json.dumps(d, indent=1, ensure_ascii=False)
			
 
				+
			
 
				         filename = "asr_error_rate.json"
			
 
				 
			
 
				         with open(output_path / filename, "w") as f:
			
 
				-            f.write(asr_error_rate_signature)
			
 
				+            f.write(asr_error_rate_json)
			
 
				 
			
 
				-        logger.info(f"ASR : {asr_error_rate_signature}")
			
 
				+        logger.info(f"ASR : {asr_error_rate_json}")
			
 
				 
			
 
				     return filename
			
--- a/src/seamless_communication/cli/streaming/README.md
+++ b/src/seamless_communication/cli/streaming/README.md
@@ -0,0 +1,45 @@
 
				+# Evaluating SeamlessStreaming and Seamless models
			
 
				+SeamlessStreaming is the streaming-only model and Seamless is the expressive streaming model.
			
 
				+
			
 
				+## Quick start:
			
 
				+
			
 
				+Evaluation can be run with the `streaming_evaluate` CLI.
			
 
				+
			
 
				+We use `seamless_streaming_unity` for loading the speech encoder and T2U models, and `seamless_streaming_monotonic_decoder` for loading the text decoder for streaming evaluation. These are already set as the defaults for the `streaming_evaluate` CLI, but can be overridden using the `--unity-model-name` and `--monotonic-decoder-model-name` args if required.
			
 
				+
			
 
				+Note that the numbers in the paper use single precision floating point format (fp32) for evaluation by setting `--dtype fp32`.
			
 
				+
			
 
				+### S2TT:
			
 
				+Set the task to `s2tt` for evaluating the speech-to-text translation part of the SeamlessStreaming model.
			
 
				+
			
 
				+```bash
			
 
				+streaming_evaluate --task s2tt --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_lang_code>
			
 
				+```
			
 
				+
			
 
				+Note: The `--ref-field` can be used to specify the name of the reference column in the dataset.
			
 
				+
			
 
				+### ASR:
			
 
				+Set the task to `asr` for evaluating the automatic speech recognition part of the SeamlessStreaming model. Make sure to pass the source language as the `--tgt-lang` arg.
			
 
				+
			
 
				+```bash
			
 
				+streaming_evaluate --task asr --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_source_lang_code>
			
 
				+```
			
 
				+
			
 
				+### S2ST:
			
 
				+
			
 
				+#### SeamlessStreaming:
			
 
				+
			
 
				+Set the task to `s2st` for evaluating the speech-to-speech translation part of the SeamlessStreaming model. 
			
 
				+
			
 
				+```bash
			
 
				+streaming_evaluate --task s2st --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_lang_code>
			
 
				+```
			
 
				+
			
 
				+#### Seamless:
			
 
				+The Seamless model is a unified model for streaming expressive speech-to-speech translation. Use the `--expressive` arg for running evaluation of this unified model.
			
 
				+
			
 
				+```bash
			
 
				+streaming_evaluate --task s2st --data-file <path_to_data_tsv_file> --audio-root-dir <path_to_audio_root_directory> --output <path_to_evaluation_output_directory> --tgt-lang <3_letter_lang_code> --expressive
			
 
				+```
			
 
				+
			
 
				+Note: In the current version of our paper, we use `vocoder_pretssel_16khz` for the evaluation, so to reproduce those results, please add this arg to the above command: `--vocoder-name vocoder_pretssel_16khz`
			
--- a/src/seamless_communication/cli/streaming/evaluate.py
+++ b/src/seamless_communication/cli/streaming/evaluate.py
@@ -5,19 +5,29 @@
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 
			
 
				 import argparse
			
 
				+import logging
			
 
				 
			
 
				 from fairseq2.assets import asset_store, download_manager
			
 
				-from seamless_communication.streaming.agents.mma_m4t_s2st import (
			
 
				-    MonotonicM4TS2STAgent,
			
 
				-    SeamlessS2STAgent,
			
 
				-)
			
 
				 from seamless_communication.cli.streaming.scorers.seamless_quality_scorer import (
			
 
				     SeamlessQualityScorer,
			
 
				 )
			
 
				-from seamless_communication.streaming.agents.mma_m4t_s2t import MonotonicM4TS2TAgent
			
 
				+from seamless_communication.streaming.agents.seamless_s2st import SeamlessS2STAgent
			
 
				+from seamless_communication.streaming.agents.seamless_streaming_s2st import (
			
 
				+    SeamlessStreamingS2STAgent,
			
 
				+)
			
 
				+from seamless_communication.streaming.agents.seamless_streaming_s2t import (
			
 
				+    SeamlessStreamingS2TAgent,
			
 
				+)
			
 
				 from simuleval.evaluator import build_evaluator
			
 
				 from simuleval.utils.agent import EVALUATION_SYSTEM_LIST, build_system_args
			
 
				 
			
 
				+logging.basicConfig(
			
 
				+    level=logging.INFO,
			
 
				+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
			
 
				+)
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				 
			
 
				 def main() -> None:
			
 
				     parser = argparse.ArgumentParser(
			
@@ -28,7 +38,7 @@ def main() -> None:
 
				 
			
 
				     parser.add_argument(
			
 
				         "--task",
			
 
				-        choices=["s2st", "s2tt"],
			
 
				+        choices=["s2st", "s2tt", "asr"],
			
 
				         required=True,
			
 
				         type=str,
			
 
				         help="Target language to translate/transcribe into.",
			
@@ -39,46 +49,33 @@ def main() -> None:
 
				         default=False,
			
 
				         help="Expressive streaming S2ST inference",
			
 
				     )
			
 
				-    parser.add_argument(
			
 
				-        "--dtype",
			
 
				-        default="fp16",
			
 
				-        type=str,
			
 
				-    )
			
 
				 
			
 
				     args, _ = parser.parse_known_args()
			
 
				 
			
 
				     model_configs = dict(
			
 
				         source_segment_size=320,
			
 
				         device="cuda:0",
			
 
				-        dtype=args.dtype,
			
 
				+        dtype="fp16",
			
 
				         min_starting_wait_w2vbert=192,
			
 
				         decision_threshold=0.5,
			
 
				         no_early_stop=True,
			
 
				-        max_len_a=1,
			
 
				-        max_len_b=200,
			
 
				+        max_len_a=0,
			
 
				+        max_len_b=100,
			
 
				     )
			
 
				 
			
 
				-    if args.dtype == "fp16":
			
 
				-        model_configs.update(dict(fp16=True))
			
 
				-
			
 
				     EVALUATION_SYSTEM_LIST.clear()
			
 
				     eval_configs = dict(quality_metrics="SEAMLESS_QUALITY_SCORER")
			
 
				     if args.task == "s2st":
			
 
				-        model_configs.update(
			
 
				-            dict(
			
 
				-                min_unit_chunk_size=50,
			
 
				-            )
			
 
				-        )
			
 
				+        model_configs["min_unit_chunk_size"] = 50
			
 
				         eval_configs["latency_metrics"] = "StartOffset EndOffset"
			
 
				 
			
 
				         if args.expressive:
			
 
				             EVALUATION_SYSTEM_LIST.append(SeamlessS2STAgent)
			
 
				-            model_configs.update(dict(vocoder_name="vocoder_pretssel"))
			
 
				         else:
			
 
				-            EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2STAgent)
			
 
				-    elif args.task == "s2tt":
			
 
				+            EVALUATION_SYSTEM_LIST.append(SeamlessStreamingS2STAgent)
			
 
				+    elif args.task in ["s2tt", "asr"]:
			
 
				         assert args.expressive is False, "S2TT inference cannot be expressive."
			
 
				-        EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2TAgent)
			
 
				+        EVALUATION_SYSTEM_LIST.append(SeamlessStreamingS2TAgent)
			
 
				         parser.add_argument(
			
 
				             "--unity-model-name",
			
 
				             type=str,
			
@@ -104,6 +101,9 @@ def main() -> None:
 
				         {**base_config, **model_configs, **eval_configs}, parser
			
 
				     )
			
 
				 
			
 
				+    if args.fp16:
			
 
				+        logger.warn("--fp16 arg will be ignorned, use --dtype instead")
			
 
				+
			
 
				     evaluator = build_evaluator(args)
			
 
				     evaluator(system)
			
 
				 
			
--- a/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py
+++ b/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py
@@ -5,23 +5,19 @@
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 
			
 
				 from __future__ import annotations
			
 
				-import pandas
			
 
				-from fairseq2.typing import Device
			
 
				-from pathlib import Path
			
 
				-from typing import Optional
			
 
				+
			
 
				 import json
			
 
				 from argparse import ArgumentParser, Namespace
			
 
				-from typing import Dict
			
 
				+from pathlib import Path
			
 
				+from typing import Dict, Optional
			
 
				 
			
 
				+import pandas
			
 
				+from fairseq2.typing import Device
			
 
				+from seamless_communication.cli.eval_utils import compute_quality_metrics
			
 
				+from simuleval.evaluator.instance import LogInstance
			
 
				 from simuleval.evaluator.scorers.quality_scorer import (
			
 
				-    register_quality_scorer,
			
 
				     QualityScorer,
			
 
				-)
			
 
				-
			
 
				-from simuleval.evaluator.instance import LogInstance
			
 
				-
			
 
				-from seamless_communication.cli.eval_utils import (
			
 
				-    compute_quality_metrics,
			
 
				+    register_quality_scorer,
			
 
				 )
			
 
				 
			
 
				 
			
@@ -90,19 +86,13 @@ class SeamlessQualityScorer(QualityScorer):  # type: ignore
 
				 
			
 
				     @staticmethod
			
 
				     def add_args(parser: ArgumentParser) -> None:
			
 
				-        try:
			
 
				-            parser.add_argument(
			
 
				-                "--task", type=str, help="Task to evaluate", required=True
			
 
				-            )
			
 
				-            parser.add_argument(
			
 
				-                "--tgt-lang",
			
 
				-                type=str,
			
 
				-                help="Target language to translate/transcribe into.",
			
 
				-                required=True,
			
 
				-            )
			
 
				-        except:
			
 
				-            pass
			
 
				-
			
 
				+        parser.add_argument("--task", type=str, help="Task to evaluate", required=True)
			
 
				+        parser.add_argument(
			
 
				+            "--tgt-lang",
			
 
				+            type=str,
			
 
				+            help="Target language to translate/transcribe into.",
			
 
				+            required=True,
			
 
				+        )
			
 
				         parser.add_argument(
			
 
				             "--whisper-model-name", type=str, help="Whisper model name", default="large"
			
 
				         )
			
--- a/src/seamless_communication/streaming/agents/online_vocoder.py
+++ b/src/seamless_communication/streaming/agents/online_vocoder.py
@@ -5,21 +5,37 @@
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 from __future__ import annotations
			
 
				 
			
 
				+import logging
			
 
				 from argparse import ArgumentParser, Namespace
			
 
				 from typing import Any, Dict
			
 
				-import torch
			
 
				 
			
 
				-from seamless_communication.models.vocoder.vocoder import Vocoder
			
 
				+import torch
			
 
				+from seamless_communication.models.vocoder.loader import load_vocoder_model
			
 
				 from simuleval.agents import AgentStates, TextToSpeechAgent
			
 
				 from simuleval.agents.actions import ReadAction, WriteAction
			
 
				 from simuleval.data.segments import SpeechSegment
			
 
				 
			
 
				+logging.basicConfig(
			
 
				+    level=logging.INFO,
			
 
				+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
			
 
				+)
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				 
			
 
				 class VocoderAgent(TextToSpeechAgent):  # type: ignore
			
 
				-    def __init__(self, vocoder: Vocoder, args: Namespace) -> None:
			
 
				+    def __init__(self, args: Namespace) -> None:
			
 
				         super().__init__(args)
			
 
				+
			
 
				+        logger.info(
			
 
				+            f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
			
 
				+        )
			
 
				+        self.vocoder = load_vocoder_model(
			
 
				+            args.vocoder_name, device=args.device, dtype=args.dtype
			
 
				+        )
			
 
				+        self.vocoder.eval()
			
 
				+
			
 
				         self.sample_rate = args.sample_rate
			
 
				-        self.vocoder = vocoder
			
 
				         self.tgt_lang = args.tgt_lang
			
 
				         self.speaker_id = args.vocoder_speaker_id
			
 
				 
			
@@ -54,6 +70,12 @@ class VocoderAgent(TextToSpeechAgent):  # type: ignore
 
				 
			
 
				     @classmethod
			
 
				     def add_args(cls, parser: ArgumentParser) -> None:
			
 
				+        parser.add_argument(
			
 
				+            "--vocoder-name",
			
 
				+            type=str,
			
 
				+            help="Vocoder name.",
			
 
				+            default="vocoder_v2",
			
 
				+        )
			
 
				         parser.add_argument(
			
 
				             "--vocoder-speaker-id",
			
 
				             type=int,
			
@@ -64,6 +86,4 @@ class VocoderAgent(TextToSpeechAgent):  # type: ignore
 
				 
			
 
				     @classmethod
			
 
				     def from_args(cls, args: Namespace, **kwargs: Dict[str, Any]) -> VocoderAgent:
			
 
				-        vocoder = kwargs.get("vocoder", None)
			
 
				-        assert isinstance(vocoder, Vocoder)
			
 
				-        return cls(vocoder, args)
			
 
				+        return cls(args)
			
--- a/src/seamless_communication/streaming/agents/pretssel_vocoder.py
+++ b/src/seamless_communication/streaming/agents/pretssel_vocoder.py
@@ -5,27 +5,46 @@
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 from __future__ import annotations
			
 
				 
			
 
				+import logging
			
 
				 from argparse import ArgumentParser, Namespace
			
 
				 from typing import Any, Dict, List
			
 
				 
			
 
				 import torch
			
 
				+from fairseq2.assets import asset_store
			
 
				 from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput
			
 
				-from seamless_communication.models.generator.vocoder import PretsselVocoder
			
 
				+from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
			
 
				 from seamless_communication.models.unity import load_gcmvn_stats
			
 
				-from seamless_communication.models.vocoder.vocoder import Vocoder
			
 
				 from seamless_communication.streaming.agents.common import NoUpdateTargetMixin
			
 
				 from simuleval.agents import AgentStates, TextToSpeechAgent
			
 
				 from simuleval.agents.actions import ReadAction, WriteAction
			
 
				 from simuleval.data.segments import SpeechSegment
			
 
				 
			
 
				+logging.basicConfig(
			
 
				+    level=logging.INFO,
			
 
				+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
			
 
				+)
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				 
			
 
				 class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):  # type: ignore
			
 
				-    def __init__(self, vocoder: Vocoder, args: Namespace) -> None:
			
 
				+    def __init__(self, args: Namespace) -> None:
			
 
				         super().__init__(args)
			
 
				-        self.vocoder = vocoder
			
 
				+
			
 
				+        logger.info(
			
 
				+            f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
			
 
				+        )
			
 
				+        assert "pretssel" in args.vocoder_name
			
 
				+        self.vocoder = load_pretssel_vocoder_model(
			
 
				+            args.vocoder_name, device=args.device, dtype=args.dtype
			
 
				+        )
			
 
				+        self.vocoder.eval()
			
 
				+
			
 
				+        vocoder_model_card = asset_store.retrieve_card(args.vocoder_name)
			
 
				+        self.vocoder_sample_rate = vocoder_model_card.field("sample_rate").as_(int)
			
 
				+
			
 
				         self.upstream_idx = args.upstream_idx
			
 
				         self.sample_rate = args.sample_rate  # input sample rate
			
 
				-        self.vocoder_sample_rate = args.vocoder_sample_rate  # output sample rate
			
 
				         self.tgt_lang = args.tgt_lang
			
 
				         self.convert_to_fbank = WaveformToFbankConverter(
			
 
				             num_mel_bins=80,
			
@@ -110,23 +129,21 @@ class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent):  # type: ign
 
				 
			
 
				     @classmethod
			
 
				     def add_args(cls, parser: ArgumentParser) -> None:
			
 
				+        parser.add_argument(
			
 
				+            "--vocoder-name",
			
 
				+            type=str,
			
 
				+            help="Vocoder name.",
			
 
				+            default="vocoder_pretssel",
			
 
				+        )
			
 
				         parser.add_argument(
			
 
				             "--upstream-idx",
			
 
				             type=int,
			
 
				             default=0,
			
 
				             help="index of encoder states where states.source contains input audio",
			
 
				         )
			
 
				-        parser.add_argument(
			
 
				-            "--vocoder-sample-rate",
			
 
				-            type=int,
			
 
				-            default=16000,
			
 
				-            help="sample rate out of the vocoder",
			
 
				-        )
			
 
				 
			
 
				     @classmethod
			
 
				     def from_args(
			
 
				         cls, args: Namespace, **kwargs: Dict[str, Any]
			
 
				     ) -> PretsselVocoderAgent:
			
 
				-        vocoder = kwargs.get("vocoder", None)
			
 
				-        assert isinstance(vocoder, PretsselVocoder)
			
 
				-        return cls(vocoder, args)
			
 
				+        return cls(args)
			
--- a/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py
+++ b/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py
@@ -4,6 +4,7 @@
 
				 # This source code is licensed under the license found in the
			
 
				 # LICENSE file in the root directory of this source tree.
			
 
				 
			
 
				+from seamless_communication.streaming.agents.detokenizer import UnitYDetokenizerAgent
			
 
				 from seamless_communication.streaming.agents.offline_w2v_bert_encoder import (
			
 
				     OfflineWav2VecBertEncoderAgent,
			
 
				 )
			
@@ -16,19 +17,15 @@ from seamless_communication.streaming.agents.online_text_decoder import (
 
				 from seamless_communication.streaming.agents.online_unit_decoder import (
			
 
				     NARUnitYUnitDecoderAgent,
			
 
				 )
			
 
				-from seamless_communication.streaming.agents.silero_vad import SileroVADAgent
			
 
				 from seamless_communication.streaming.agents.online_vocoder import VocoderAgent
			
 
				-from seamless_communication.streaming.agents.pretssel_vocoder import PretsselVocoderAgent
			
 
				-
			
 
				-from seamless_communication.streaming.agents.detokenizer import UnitYDetokenizerAgent
			
 
				+from seamless_communication.streaming.agents.silero_vad import SileroVADAgent
			
 
				 from seamless_communication.streaming.agents.unity_pipeline import (
			
 
				     UnitYAgentPipeline,
			
 
				     UnitYAgentTreePipeline,
			
 
				 )
			
 
				-from simuleval.utils import entrypoint
			
 
				 
			
 
				 
			
 
				-class MonotonicM4TS2STAgent(UnitYAgentPipeline):
			
 
				+class SeamlessStreamingS2STAgent(UnitYAgentPipeline):
			
 
				     pipeline = [
			
 
				         OnlineFeatureExtractorAgent,
			
 
				         OfflineWav2VecBertEncoderAgent,
			
@@ -38,17 +35,7 @@ class MonotonicM4TS2STAgent(UnitYAgentPipeline):
 
				     ]
			
 
				 
			
 
				 
			
 
				-class SeamlessS2STAgent(UnitYAgentPipeline):
			
 
				-    pipeline = [
			
 
				-        OnlineFeatureExtractorAgent,
			
 
				-        OfflineWav2VecBertEncoderAgent,
			
 
				-        UnitYMMATextDecoderAgent,
			
 
				-        NARUnitYUnitDecoderAgent,
			
 
				-        PretsselVocoderAgent,
			
 
				-    ]
			
 
				-
			
 
				-
			
 
				-class MonotonicM4TS2STVADAgent(UnitYAgentPipeline):
			
 
				+class SeamlessStreamingS2STVADAgent(UnitYAgentPipeline):
			
 
				     pipeline = [
			
 
				         SileroVADAgent,
			
 
				         OnlineFeatureExtractorAgent,
			
@@ -59,7 +46,7 @@ class MonotonicM4TS2STVADAgent(UnitYAgentPipeline):
 
				     ]
			
 
				 
			
 
				 
			
 
				-class MonotonicM4TS2STJointVADAgent(UnitYAgentTreePipeline):
			
 
				+class SeamlessStreamingS2STJointVADAgent(UnitYAgentTreePipeline):
			
 
				     pipeline = {
			
 
				         SileroVADAgent: [OnlineFeatureExtractorAgent],
			
 
				         OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent],
			
@@ -69,15 +56,3 @@ class MonotonicM4TS2STJointVADAgent(UnitYAgentTreePipeline):
 
				         NARUnitYUnitDecoderAgent: [VocoderAgent],
			
 
				         VocoderAgent: [],
			
 
				     }
			
 
				-
			
 
				-
			
 
				-class SeamlessS2STJointVADAgent(UnitYAgentTreePipeline):
			
 
				-    pipeline = {
			
 
				-        SileroVADAgent: [OnlineFeatureExtractorAgent],
			
 
				-        OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent],
			
 
				-        OfflineWav2VecBertEncoderAgent: [UnitYMMATextDecoderAgent],
			
 
				-        UnitYMMATextDecoderAgent: [UnitYDetokenizerAgent, NARUnitYUnitDecoderAgent],
			
 
				-        UnitYDetokenizerAgent: [],
			
 
				-        NARUnitYUnitDecoderAgent: [PretsselVocoderAgent],
			
 
				-        PretsselVocoderAgent: [],
			
 
				-    }
			
--- a/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py
+++ b/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py
@@ -16,11 +16,9 @@ from seamless_communication.streaming.agents.online_text_decoder import (
 
				 )
			
 
				 from seamless_communication.streaming.agents.silero_vad import SileroVADAgent
			
 
				 from seamless_communication.streaming.agents.unity_pipeline import UnitYAgentPipeline
			
 
				-from simuleval.utils import entrypoint
			
 
				 
			
 
				 
			
 
				-@entrypoint
			
 
				-class MonotonicM4TS2TDetokAgent(UnitYAgentPipeline):
			
 
				+class SeamlessStreamingS2TDetokAgent(UnitYAgentPipeline):
			
 
				     pipeline = [
			
 
				         OnlineFeatureExtractorAgent,
			
 
				         OfflineWav2VecBertEncoderAgent,
			
@@ -29,8 +27,7 @@ class MonotonicM4TS2TDetokAgent(UnitYAgentPipeline):
 
				     ]
			
 
				 
			
 
				 
			
 
				-@entrypoint
			
 
				-class MonotonicM4TS2TAgent(UnitYAgentPipeline):
			
 
				+class SeamlessStreamingS2TAgent(UnitYAgentPipeline):
			
 
				     pipeline = [
			
 
				         OnlineFeatureExtractorAgent,
			
 
				         OfflineWav2VecBertEncoderAgent,
			
@@ -38,7 +35,7 @@ class MonotonicM4TS2TAgent(UnitYAgentPipeline):
 
				     ]
			
 
				 
			
 
				 
			
 
				-class MonotonicM4TS2TVADAgent(UnitYAgentPipeline):
			
 
				+class SeamlessStreamingS2TVADAgent(UnitYAgentPipeline):
			
 
				     pipeline = [
			
 
				         SileroVADAgent,
			
 
				         OnlineFeatureExtractorAgent,
			
--- a/src/seamless_communication/streaming/agents/unity_pipeline.py
+++ b/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -74,12 +74,7 @@ class UnitYPipelineMixin:
 
				             help="Monotonic decoder model name.",
			
 
				             default="seamless_streaming_monotonic_decoder",
			
 
				         )
			
 
				-        parser.add_argument(
			
 
				-            "--vocoder-name",
			
 
				-            type=str,
			
 
				-            help="Vocoder name.",
			
 
				-            default="vocoder_v2",
			
 
				-        )
			
 
				+
			
 
				         parser.add_argument(
			
 
				             "--sample-rate",
			
 
				             default=16000,
			
@@ -147,22 +142,6 @@ class UnitYPipelineMixin:
 
				         )
			
 
				         monotonic_decoder_model.eval()
			
 
				 
			
 
				-        vocoder: Optional[Union[PretsselVocoder, Vocoder]] = None
			
 
				-        if args.vocoder_name is not None and output_modality == Modality.SPEECH:
			
 
				-            logger.info(
			
 
				-                f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}"
			
 
				-            )
			
 
				-            if "pretssel" in args.vocoder_name:
			
 
				-                vocoder = load_pretssel_vocoder_model(
			
 
				-                    args.vocoder_name, device=args.device, dtype=args.dtype
			
 
				-                )
			
 
				-            else:
			
 
				-                vocoder = load_vocoder_model(
			
 
				-                    args.vocoder_name, device=args.device, dtype=args.dtype
			
 
				-                )
			
 
				-            assert vocoder is not None
			
 
				-            vocoder.eval()
			
 
				-
			
 
				         return {
			
 
				             "unity_model": unity_model,
			
 
				             "unity_config": unity_config,
			
@@ -170,7 +149,6 @@ class UnitYPipelineMixin:
 
				             "monotonic_decoder_config": monotonic_decoder_config,
			
 
				             "text_tokenizer": text_tokenizer,
			
 
				             "unit_tokenizer": unit_tokenizer,
			
 
				-            "vocoder": vocoder,
			
 
				         }
			
 
				 
			
 
				 
			
--- a/src/seamless_communication/streaming/dataloaders/s2tt.py
+++ b/src/seamless_communication/streaming/dataloaders/s2tt.py
@@ -55,7 +55,13 @@ class SileroVADSilenceRemover:
 
				             onnx=False,
			
 
				         )
			
 
				 
			
 
				-    def __call__(self, sample_list: List[float]) -> List[float]:
			
 
				+    def __call__(self, sample: torch.Tensor, is_standardized: bool) -> List[float]:
			
 
				+        if not is_standardized:
			
 
				+            # Standardizing here just for getting silence boundaries
			
 
				+            standarized_sample_list = F.layer_norm(sample, sample.shape).tolist()
			
 
				+        else:
			
 
				+            standarized_sample_list = sample.tolist()
			
 
				+
			
 
				         (
			
 
				             get_speech_timestamps,
			
 
				             save_audio,
			
@@ -64,8 +70,10 @@ class SileroVADSilenceRemover:
 
				             collect_chunks,
			
 
				         ) = self.utils
			
 
				         speech_timestamps = get_speech_timestamps(
			
 
				-            sample_list, self.model, sampling_rate=self.sample_rate
			
 
				+            standarized_sample_list, self.model, sampling_rate=self.sample_rate
			
 
				         )
			
 
				+
			
 
				+        sample_list: List[float] = sample.tolist()
			
 
				         if len(speech_timestamps) == 0:
			
 
				             return sample_list
			
 
				         speech_start_time = speech_timestamps[0]["start"]
			
@@ -75,7 +83,9 @@ class SileroVADSilenceRemover:
 
				 
			
 
				 @register_dataloader("fairseq2_s2tt")
			
 
				 class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader):  # type: ignore
			
 
				-    def __init__(self, data_pipeline: DataPipeline, args: Namespace) -> None:
			
 
				+    def __init__(
			
 
				+        self, data_pipeline: DataPipeline, is_standardized: bool, args: Namespace
			
 
				+    ) -> None:
			
 
				         self.args = args
			
 
				         self.data_file: Path = Path(getattr(self.args, "data_file", ""))
			
 
				         if not self.data_file.exists():
			
@@ -83,10 +93,12 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
				         self.start_index: int = getattr(self.args, "start_index", 0)
			
 
				         self.end_index: int = getattr(self.args, "end_index", -1)
			
 
				         self.data_pipeline = data_pipeline
			
 
				+        self.is_standardized = is_standardized
			
 
				         self.data_itr = iter(self.data_pipeline)
			
 
				         self.cur_index = self.start_index - 1
			
 
				+        self.no_strip_silence = self.args.no_strip_silence
			
 
				         self.silence_remover = None
			
 
				-        if self.args.strip_silence:
			
 
				+        if not self.no_strip_silence:
			
 
				             logger.warn(
			
 
				                 "Stripping silence in the beginning and the end of audio with SileroVAD."
			
 
				             )
			
@@ -113,12 +125,12 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
				         return self.end_index - self.start_index
			
 
				 
			
 
				     def get_source(self, index: Optional[int] = None) -> List[float]:
			
 
				-        source: List[float] = (
			
 
				-            self.item["audio"]["data"]["waveform"]["seqs"].squeeze().tolist()
			
 
				-        )
			
 
				+        squeezed_item = self.item["audio"]["data"]["waveform"]["seqs"].squeeze()
			
 
				 
			
 
				-        if self.silence_remover is not None:
			
 
				-            source = self.silence_remover(source)
			
 
				+        if not self.no_strip_silence and self.silence_remover is not None:
			
 
				+            source = self.silence_remover(squeezed_item, self.is_standardized)
			
 
				+        else:
			
 
				+            source = squeezed_item.tolist()
			
 
				 
			
 
				         return source
			
 
				 
			
@@ -168,10 +180,13 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
				             selector="audio.data",
			
 
				         )
			
 
				 
			
 
				-        pipeline_builder.map(
			
 
				-            lambda x: F.layer_norm(x, x.shape),
			
 
				-            selector="audio.data.waveform",
			
 
				-        )
			
 
				+        is_standardized = False
			
 
				+        if args.standardize_audio:
			
 
				+            pipeline_builder.map(
			
 
				+                lambda x: F.layer_norm(x, x.shape),
			
 
				+                selector="audio.data.waveform",
			
 
				+            )
			
 
				+            is_standardized = True
			
 
				 
			
 
				         collate = Collater(pad_value=0, pad_to_multiple=1)
			
 
				 
			
@@ -181,7 +196,7 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
				 
			
 
				         data_pipeline = pipeline_builder.and_return()
			
 
				 
			
 
				-        return cls(data_pipeline, args)
			
 
				+        return cls(data_pipeline, is_standardized, args)
			
 
				 
			
 
				     @staticmethod
			
 
				     def add_args(parser: ArgumentParser) -> None:
			
@@ -222,8 +237,13 @@ class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader
 
				             help="Output directory. Required if using iterable dataloader.",
			
 
				         )
			
 
				         parser.add_argument(
			
 
				-            "--strip-silence",
			
 
				+            "--no-strip-silence",
			
 
				             action="store_true",
			
 
				             default=False,
			
 
				             help="Strip silence in the beginning and the end of audio.",
			
 
				         )
			
 
				+        parser.add_argument(
			
 
				+            "--standardize-audio",
			
 
				+            action="store_true",
			
 
				+            help="Standardize audio.",
			
 
				+        )