2 years ago · 7c64fef0d4
--- a/src/seamless_communication/cli/eval_utils/compute_metrics.py
+++ b/src/seamless_communication/cli/eval_utils/compute_metrics.py
@@ -16,7 +16,7 @@ from sacrebleu.metrics.bleu import BLEU
 
				 from sacrebleu.metrics.chrf import CHRF
			
 
				 from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2
			
 
				 from tqdm import tqdm
			
 
				-from typing import Tuple, Union
			
 
				+from typing import Optional, Tuple, Union
			
 
				 from whisper import Whisper
			
 
				 from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer
			
 
				 
			
@@ -257,9 +257,9 @@ def compute_quality_metrics(
 
				     whisper_model_name: str = "large",
			
 
				     whisper_normalize_text_output: bool = False,
			
 
				     ref_text_col_name: str = "ref_tgt_text",
			
 
				-    pred_text_col_name: str = "pred_tgt_text",
			
 
				+    pred_text_col_name: Optional[str] = "pred_tgt_text",
			
 
				     pred_audio_col_name: str = "pred_tgt_audio",
			
 
				-) -> None:
			
 
				+) -> str:
			
 
				     """Wraps asr and s2t bleu functions to call it with TSV manifest composed on expressivity side
			
 
				     Args:
			
 
				         output_manifest_tsv_path (Path): output manifest which has "ref_text", "hypo_audio", "s2t_out" column names
			
@@ -337,9 +337,10 @@ def compute_quality_metrics(
 
				         asr_bleu_normalized_json = asr_bleu_normalized.format(
			
 
				             signature=asr_bleu_normalized_signature.format(), is_json=True
			
 
				         )
			
 
				+        filename = f"{task.lower()}_asr_bleu_normalized.json"
			
 
				 
			
 
				         with open(
			
 
				-            output_path / f"{task.lower()}_asr_bleu_normalized.json",
			
 
				+            output_path / filename,
			
 
				             "w",
			
 
				         ) as f:
			
 
				             f.write(asr_bleu_normalized_json)
			
@@ -353,8 +354,11 @@ def compute_quality_metrics(
 
				             lang=tgt_lang,
			
 
				             whisper_normalize_text=whisper_normalize_text_output,
			
 
				         )
			
 
				+        filename = "asr_error_rate.json"
			
 
				 
			
 
				-        with open(output_path / "asr_error_rate.json", "w") as f:
			
 
				+        with open(output_path / filename, "w") as f:
			
 
				             f.write(asr_error_rate_signature)
			
 
				 
			
 
				         logger.info(f"ASR : {asr_error_rate_signature}")
			
 
				+
			
 
				+    return filename
			
--- a/src/seamless_communication/cli/streaming/evaluate.py
+++ b/src/seamless_communication/cli/streaming/evaluate.py
@@ -7,14 +7,13 @@
 
				 import argparse
			
 
				 
			
 
				 from fairseq2.assets import asset_store, download_manager
			
 
				-from seamless_communication.cli.eval_utils import get_tokenizer
			
 
				-from seamless_communication.cli.streaming.scorers.seamless_whisper_asr_bleu import (
			
 
				-    SeamlessWhisperASRSacreBLEUScorer as SeamlessWhisperASRSacreBLEUScorer,
			
 
				-)
			
 
				 from seamless_communication.streaming.agents.mma_m4t_s2st import (
			
 
				     MonotonicM4TS2STAgent,
			
 
				     SeamlessS2STAgent,
			
 
				 )
			
 
				+from seamless_communication.cli.streaming.scorers.seamless_quality_scorer import (
			
 
				+    SeamlessQualityScorer,
			
 
				+)
			
 
				 from seamless_communication.streaming.agents.mma_m4t_s2t import MonotonicM4TS2TAgent
			
 
				 from simuleval.evaluator import build_evaluator
			
 
				 from simuleval.utils.agent import EVALUATION_SYSTEM_LIST, build_system_args
			
@@ -63,24 +62,22 @@ def main() -> None:
 
				         model_configs.update(dict(fp16=True))
			
 
				 
			
 
				     EVALUATION_SYSTEM_LIST.clear()
			
 
				+    eval_configs = dict(quality_metrics="SEAMLESS_QUALITY_SCORER")
			
 
				     if args.task == "s2st":
			
 
				         model_configs.update(
			
 
				             dict(
			
 
				                 min_unit_chunk_size=50,
			
 
				             )
			
 
				         )
			
 
				-        eval_configs = dict(
			
 
				-            quality_metrics="SEAMLESS_WHISPER_ASR_BLEU",
			
 
				-            latency_metrics="StartOffset EndOffset",
			
 
				-            whisper_model_size="large-v2",
			
 
				-            normalize_asr_bleu_references=True,
			
 
				-        )
			
 
				+        eval_configs["latency_metrics"] = "StartOffset EndOffset"
			
 
				+
			
 
				         if args.expressive:
			
 
				             EVALUATION_SYSTEM_LIST.append(SeamlessS2STAgent)
			
 
				             model_configs.update(dict(vocoder_name="vocoder_pretssel"))
			
 
				         else:
			
 
				             EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2STAgent)
			
 
				     elif args.task == "s2tt":
			
 
				+        assert args.expressive is False, "S2TT inference cannot be expressive."
			
 
				         EVALUATION_SYSTEM_LIST.append(MonotonicM4TS2TAgent)
			
 
				         parser.add_argument(
			
 
				             "--unity-model-name",
			
@@ -88,24 +85,15 @@ def main() -> None:
 
				             help="Unity model name.",
			
 
				             default="seamless_streaming_unity",
			
 
				         )
			
 
				-        parser.add_argument(
			
 
				-            "--tgt-lang",
			
 
				-            default="eng",
			
 
				-            type=str,
			
 
				-            help="Target language to translate/transcribe into.",
			
 
				-        )
			
 
				         args, _ = parser.parse_known_args()
			
 
				         asset_card = asset_store.retrieve_card(name=args.unity_model_name)
			
 
				         tokenizer_uri = asset_card.field("tokenizer").as_uri()
			
 
				         tokenizer_path = download_manager.download_tokenizer(
			
 
				             tokenizer_uri, asset_card.name, force=False, progress=True
			
 
				         )
			
 
				-        eval_configs = dict(
			
 
				-            sacrebleu_tokenizer=get_tokenizer(args.tgt_lang),
			
 
				-            eval_latency_unit="spm",
			
 
				-            eval_latency_spm_model=tokenizer_path,
			
 
				-            latency_metrics="AL LAAL",
			
 
				-        )
			
 
				+        eval_configs["latency_metrics"] = "AL LAAL"
			
 
				+        eval_configs["eval_latency_unit"] = "spm"
			
 
				+        eval_configs["eval_latency_spm_model"] = tokenizer_path
			
 
				 
			
 
				     base_config = dict(
			
 
				         dataloader="fairseq2_s2tt",
			
--- a/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py
+++ b/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py
@@ -0,0 +1,146 @@
 
				+# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				+# All rights reserved.
			
 
				+#
			
 
				+# This source code is licensed under the license found in the
			
 
				+# LICENSE file in the root directory of this source tree.
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+import pandas
			
 
				+from fairseq2.typing import Device
			
 
				+from pathlib import Path
			
 
				+from typing import Optional
			
 
				+import json
			
 
				+from argparse import ArgumentParser, Namespace
			
 
				+from typing import Dict
			
 
				+
			
 
				+from simuleval.evaluator.scorers.quality_scorer import (
			
 
				+    register_quality_scorer,
			
 
				+    QualityScorer,
			
 
				+)
			
 
				+
			
 
				+from simuleval.evaluator.instance import LogInstance
			
 
				+
			
 
				+from seamless_communication.cli.eval_utils import (
			
 
				+    compute_quality_metrics,
			
 
				+)
			
 
				+
			
 
				+
			
 
				+@register_quality_scorer("SEAMLESS_QUALITY_SCORER")
			
 
				+class SeamlessQualityScorer(QualityScorer):  # type: ignore
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        tgt_lang: str,
			
 
				+        task: str,
			
 
				+        output_dir: str,
			
 
				+        device: Device = "cuda:0",
			
 
				+        whisper_model_name: str = "large",
			
 
				+        whisper_normalize_text_output: Optional[bool] = None,
			
 
				+        ref_text_col_name: str = "ref_tgt_text",
			
 
				+        pred_text_col_name: str = "pred_tgt_text",
			
 
				+        pred_audio_col_name: str = "pred_tgt_audio",
			
 
				+    ) -> None:
			
 
				+        super().__init__()
			
 
				+        self.tgt_lang = tgt_lang
			
 
				+        self.task = task.upper()
			
 
				+        self.device = device
			
 
				+        self.output_dir = Path(output_dir)
			
 
				+        self.whisper_model_name = whisper_model_name
			
 
				+        self.whisper_normalize_text_output = whisper_normalize_text_output
			
 
				+        if self.whisper_normalize_text_output is None:
			
 
				+            self.whisper_normalize_text_output = (
			
 
				+                False if self.task in ["S2TT", "S2ST", "T2TT"] else True
			
 
				+            )
			
 
				+        self.ref_text_col_name = ref_text_col_name
			
 
				+        self.pred_text_col_name = pred_text_col_name
			
 
				+        self.pred_audio_col_name = pred_audio_col_name
			
 
				+
			
 
				+    def __call__(self, instances: Dict[int, LogInstance]) -> float:
			
 
				+        references = [ins.reference for ins in instances.values()]
			
 
				+        df = pandas.DataFrame({self.ref_text_col_name: references})
			
 
				+        if self.task in ["ASR", "S2TT", "T2TT"]:
			
 
				+            predictions = [ins.prediction for ins in instances.values()]
			
 
				+            df[self.pred_text_col_name] = predictions
			
 
				+        else:
			
 
				+            predictions = [ins.prediction for ins in instances.values()]
			
 
				+            df[self.pred_audio_col_name] = predictions
			
 
				+
			
 
				+        df.to_csv(
			
 
				+            self.output_dir / "results.tsv",
			
 
				+            sep="\t",
			
 
				+            quoting=3,
			
 
				+            encoding="utf-8",
			
 
				+        )
			
 
				+        filename = compute_quality_metrics(
			
 
				+            self.output_dir / "results.tsv",
			
 
				+            self.output_dir,
			
 
				+            self.tgt_lang,
			
 
				+            self.task,
			
 
				+            self.device,
			
 
				+            self.whisper_model_name,
			
 
				+            self.whisper_normalize_text_output,
			
 
				+            self.ref_text_col_name,
			
 
				+            self.pred_text_col_name if self.task in ["ASR", "S2TT", "T2TT"] else None,
			
 
				+            self.pred_audio_col_name,
			
 
				+        )
			
 
				+
			
 
				+        with open(self.output_dir / filename, "r") as f:
			
 
				+            corpus_metric_score = json.load(f)["score"]
			
 
				+
			
 
				+        return corpus_metric_score  # type: ignore[no-any-return]
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def add_args(parser: ArgumentParser) -> None:
			
 
				+        try:
			
 
				+            parser.add_argument(
			
 
				+                "--task", type=str, help="Task to evaluate", required=True
			
 
				+            )
			
 
				+            parser.add_argument(
			
 
				+                "--tgt-lang",
			
 
				+                type=str,
			
 
				+                help="Target language to translate/transcribe into.",
			
 
				+                required=True,
			
 
				+            )
			
 
				+        except:
			
 
				+            pass
			
 
				+
			
 
				+        parser.add_argument(
			
 
				+            "--whisper-model-name", type=str, help="Whisper model name", default="large"
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--whisper-normalize-text-output",
			
 
				+            action="store_true",
			
 
				+            help="Normalize text output",
			
 
				+            default=None,
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--ref-text-col-name",
			
 
				+            type=str,
			
 
				+            help="Reference text column name",
			
 
				+            default="ref_tgt_text",
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--pred-text-col-name",
			
 
				+            type=str,
			
 
				+            help="Prediction text column name",
			
 
				+            default="pred_tgt_text",
			
 
				+        )
			
 
				+        parser.add_argument(
			
 
				+            "--pred-audio-col-name",
			
 
				+            type=str,
			
 
				+            help="Prediction audio column name",
			
 
				+            default="pred_tgt_audio",
			
 
				+        )
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_args(cls, args: Namespace) -> SeamlessQualityScorer:
			
 
				+        return cls(
			
 
				+            tgt_lang=args.tgt_lang,
			
 
				+            task=args.task,
			
 
				+            output_dir=args.output,
			
 
				+            device=getattr(args, "device", "cpu"),
			
 
				+            whisper_model_name=args.whisper_model_name,
			
 
				+            whisper_normalize_text_output=args.whisper_normalize_text_output,
			
 
				+            ref_text_col_name=args.ref_text_col_name,
			
 
				+            pred_text_col_name=args.pred_text_col_name,
			
 
				+            pred_audio_col_name=args.pred_audio_col_name,
			
 
				+        )
			
--- a/src/seamless_communication/cli/streaming/scorers/seamless_whisper_asr_bleu.py
+++ b/src/seamless_communication/cli/streaming/scorers/seamless_whisper_asr_bleu.py
@@ -1,84 +0,0 @@
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates
			
 
				-# All rights reserved.
			
 
				-#
			
 
				-# This source code is licensed under the license found in the
			
 
				-# LICENSE file in the root directory of this source tree.
			
 
				-from __future__ import annotations
			
 
				-
			
 
				-from argparse import ArgumentParser, Namespace
			
 
				-from typing import Dict, List
			
 
				-
			
 
				-from sacrebleu.metrics.bleu import BLEU
			
 
				-from seamless_communication.cli.eval_utils import LANG3_LANG2, get_tokenizer
			
 
				-from simuleval.evaluator.instance import LogInstance
			
 
				-from simuleval.evaluator.scorers.quality_scorer import (
			
 
				-    WhisperASRSacreBLEUScorer,
			
 
				-    register_quality_scorer,
			
 
				-)
			
 
				-from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer
			
 
				-
			
 
				-
			
 
				-def normalize_text_whisper(sentences: List[str], lang: str) -> List[str]:
			
 
				-    if lang in ["en", "eng"]:
			
 
				-        normalizer = EnglishTextNormalizer()
			
 
				-    else:
			
 
				-        normalizer = BasicTextNormalizer()
			
 
				-    normalized_sentences = []
			
 
				-    for text in sentences:
			
 
				-        normalized_sentences.append(normalizer(text))
			
 
				-    return normalized_sentences
			
 
				-
			
 
				-
			
 
				-@register_quality_scorer("SEAMLESS_WHISPER_ASR_BLEU")
			
 
				-class SeamlessWhisperASRSacreBLEUScorer(WhisperASRSacreBLEUScorer):  # type: ignore
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        tokenizer: str = "13a",
			
 
				-        target_lang: str = "en",
			
 
				-        model_size: str = "base",
			
 
				-        lowercase: bool = False,
			
 
				-        remove_punctuations: bool = False,
			
 
				-        normalize_asr_bleu_references: bool = False,
			
 
				-    ) -> None:
			
 
				-        super().__init__()
			
 
				-        self.tokenizer = tokenizer
			
 
				-        self.target_lang = target_lang
			
 
				-        self.model_size = model_size
			
 
				-        self.lowercase = lowercase
			
 
				-        self.remove_punctuations = remove_punctuations
			
 
				-        self.normalize_asr_bleu_references = normalize_asr_bleu_references
			
 
				-
			
 
				-    def __call__(self, instances: Dict[int, LogInstance]) -> float:
			
 
				-        transcripts = self.asr_transcribe(instances)
			
 
				-        references = [[ins.reference for ins in instances.values()]]
			
 
				-
			
 
				-        if self.normalize_asr_bleu_references:
			
 
				-            transcripts = normalize_text_whisper(transcripts, self.target_lang)
			
 
				-            references = [normalize_text_whisper(references[0], self.target_lang)]
			
 
				-
			
 
				-        score = (
			
 
				-            BLEU(tokenize=self.tokenizer).corpus_score(transcripts, references).score
			
 
				-        )
			
 
				-        return score  # type: ignore[no-any-return]
			
 
				-
			
 
				-    @staticmethod
			
 
				-    def add_args(parser: ArgumentParser) -> None:
			
 
				-        WhisperASRSacreBLEUScorer.add_args(parser)
			
 
				-        parser.add_argument(
			
 
				-            "--normalize-asr-bleu-references",
			
 
				-            action="store_true",
			
 
				-            help="Normalize asr transcript and reference",
			
 
				-        )
			
 
				-
			
 
				-    @classmethod
			
 
				-    def from_args(cls, args: Namespace) -> SeamlessWhisperASRSacreBLEUScorer:
			
 
				-        sacrebleu_tokenizer = get_tokenizer(args.tgt_lang)
			
 
				-        tgt_lang_2ltr = LANG3_LANG2[args.tgt_lang]
			
 
				-        return cls(
			
 
				-            sacrebleu_tokenizer,
			
 
				-            tgt_lang_2ltr,
			
 
				-            args.whisper_model_size,
			
 
				-            args.transcript_lowercase,
			
 
				-            args.transcript_non_punctuation,
			
 
				-            args.normalize_asr_bleu_references,
			
 
				-        )