Online feature extractor SimulEval agent. (#107)

* Initial commit adding online_feature_extractor.

* Fix args issue and the tokenizer download path.

* Fix mypy issues in dataloader.py

* Fix model name argparsing; set UnitY use_text_decoder to False.

* Import the dataloader to register it, remove unnecessary args, fix dataloader issues.

* Simplify script and args (#108)

* Fix bug in convert_to_fbank waveform_scale.

* Dataloader next iterator explicit try-catch (#109)

* Next iterator explicit try-catch

* Type hints - mypy fixes

* Update src/seamless_communication/agents/dataloader.py

Co-authored-by: Abinesh Ramakrishnan <3632454+ibanesh@users.noreply.github.com>

* Address comments: move to streaming.agents; fix dtype and device in the UnitY pipeline.

* Remove remnant of refactor.

* Enforce start and end index in iterable dataloader (#117)

* Remove eval file in cli/streaming, create a new dataloaders dir under streaming/

* Simplify __next__ of SimulEvalSpeechToTextDataloader, and set range [start_index, end_index).

* Fix bug in dataloader: self.data_file not declared.

* Fix bug in s2tt dataloader when end_index is not set.

* Dataloader improvements

* Move skip to the appropriate position

---------

Co-authored-by: Abinesh Ramakrishnan <3632454+ibanesh@users.noreply.github.com>
Kaushik Ram Sadagopan 1 year ago
parent
commit
521a374213

+ 1 - 0
setup.py

@@ -25,6 +25,7 @@ setup(
         "fairseq2==0.2.*",
         "librosa",
         "openai-whisper",
+        "simuleval",
         "soundfile",
         "torchaudio",
         "tqdm",

+ 5 - 0
src/seamless_communication/cli/__init__.py

@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

+ 16 - 0
src/seamless_communication/cli/eval_utils/__init__.py

@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from seamless_communication.cli.eval_utils.compute_metrics import (
+    compute_quality_metrics as compute_quality_metrics,
+)
+from seamless_communication.cli.eval_utils.compute_metrics import (
+    get_tokenizer as get_tokenizer,
+)
+from seamless_communication.cli.eval_utils.lang_mapping import (
+    LANG2_LANG3 as LANG2_LANG3,
+)

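Note: the "name as name" pattern above is the explicit re-export idiom, which keeps these symbols public under mypy's strict re-export checking. A minimal sketch of the import it enables (the m4t evaluate.py change below does exactly this):

    # Callers can now import from the package root instead of the submodule:
    from seamless_communication.cli.eval_utils import (
        compute_quality_metrics,
        get_tokenizer,
        LANG2_LANG3,
    )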
+ 2 - 6
src/seamless_communication/cli/expressivity/evaluate/evaluate.py

@@ -7,24 +7,20 @@
 import argparse
 import contextlib
 import logging
-import subprocess
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Optional
 
 import torch
 import torchaudio
-from fairseq2.assets import asset_store
-from fairseq2.data import Collater, CString, DataPipeline, FileMapper
+from fairseq2.data import Collater, DataPipeline, FileMapper
 from fairseq2.data.audio import (
     AudioDecoder,
     WaveformToFbankConverter,
     WaveformToFbankOutput,
 )
 from fairseq2.data.text import StrSplitter, TextTokenizer, read_text
-from fairseq2.data.typing import PathLike, StringLike
 from fairseq2.generation import SequenceGeneratorOptions
-from fairseq2.nn.padding import get_seqs_and_padding_mask
 from fairseq2.typing import DataType, Device
 from sacrebleu.metrics import BLEU  # type: ignore[attr-defined]
 from torch import Tensor

+ 4 - 1
src/seamless_communication/cli/m4t/evaluate/evaluate.py

@@ -25,7 +25,7 @@ from fairseq2.typing import DataType, Device
 from torch import Tensor
 from tqdm import tqdm
 
-from seamless_communication.cli.eval_utils.compute_metrics import (
+from seamless_communication.cli.eval_utils import (
     compute_quality_metrics,
 )
 from seamless_communication.cli.m4t.predict import (
@@ -368,6 +368,9 @@ def main(optional_args: Optional[Dict[str, Any]] = None) -> None:
             "Please provide required arguments for evaluation - data_file, task, tgt_lang"
         )
 
+    if not Path(args.data_file).exists():
+        raise ValueError(f"Invalid data_file to be evaluated: {args.data_file}")
+
     input_modality, output_modality = Translator.get_modalities_from_task_str(args.task)
 
     if input_modality == Modality.SPEECH and not Path(args.audio_root_dir).exists():

+ 5 - 0
src/seamless_communication/streaming/__init__.py

@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

+ 9 - 0
src/seamless_communication/streaming/agents/__init__.py

@@ -0,0 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from seamless_communication.streaming.agents.mma_m4t_s2t import (
+    MonotonicM4TS2TSPMAgent as MonotonicM4TS2TSPMAgent,
+)

+ 18 - 0
src/seamless_communication/streaming/agents/mixins.py

@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Mixins for fairseq2 simuleval agents
+"""
+
+
+class EarlyStoppingMixin:
+    def reset_early(self) -> None:
+        """
+        Implement to override for different behavior on a reset that
+        happens before EOS
+        """
+        raise NotImplementedError()

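Note: a hypothetical sketch (not part of this commit) of how an agent's states class could opt into the mixin, assuming AgentStates' standard target/target_finished fields. Subclass both AgentStates and EarlyStoppingMixin and decide in reset_early what survives a pre-EOS reset:

    from simuleval.agents.states import AgentStates

    class MyAgentStates(AgentStates, EarlyStoppingMixin):  # hypothetical
        def reset_early(self) -> None:
            # Drop the partial target but keep buffered source audio, so an
            # early stop does not discard unread input.
            self.target = []
            self.target_finished = False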
+ 16 - 0
src/seamless_communication/streaming/agents/mma_m4t_s2t.py

@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from seamless_communication.streaming.agents.online_feature_extractor import (
+    OnlineFeatureExtractorAgent,
+)
+from seamless_communication.streaming.agents.unity_pipeline import UnitYAgentPipeline
+from simuleval.utils import entrypoint
+
+
+@entrypoint
+class MonotonicM4TS2TSPMAgent(UnitYAgentPipeline):
+    pipeline = [OnlineFeatureExtractorAgent]

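Note: the @entrypoint decorator registers the class so SimulEval can discover it. A hypothetical construction sketch (argument values assumed; device and fp16 normally arrive via SimulEval's own CLI options, and from_args loads the full UnitY and monotonic decoder models, so this is illustrative rather than a quick test):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    MonotonicM4TS2TSPMAgent.add_args(parser)   # adds pipeline + per-agent args
    args = parser.parse_args(["--task", "s2tt"])
    args.device, args.fp16 = "cpu", False      # normally injected by SimulEval
    agent = MonotonicM4TS2TSPMAgent.from_args(args)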
+ 152 - 0
src/seamless_communication/streaming/agents/online_feature_extractor.py

@@ -0,0 +1,152 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import math
+import torch
+
+from argparse import ArgumentParser, Namespace
+from typing import Any, List
+
+from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput
+
+from simuleval.agents import SpeechToSpeechAgent
+from simuleval.agents.actions import Action, ReadAction, WriteAction
+from simuleval.agents.states import AgentStates
+from simuleval.data.segments import Segment, SpeechSegment
+
+
+SHIFT_SIZE = 10
+WINDOW_SIZE = 25
+SAMPLE_RATE = 16000
+FEATURE_DIM = 80
+
+
+class FeatureStates(AgentStates):
+    def reset(self) -> None:
+        super().reset()
+        self.previous_residual_samples: List[float] = []
+        self.tgt_lang = None
+
+    def update_source(self, segment: Segment) -> None:
+        """
+        Update states from input segment
+        Args:
+            segment (~simuleval.agents.segments.Segment): input segment
+        """
+        self.source_finished = segment.finished
+        if self.tgt_lang is None and segment.tgt_lang is not None:
+            self.tgt_lang = segment.tgt_lang
+        if not segment.is_empty:
+            self.source.append(segment.content)
+
+
+class OnlineFeatureExtractorAgent(SpeechToSpeechAgent):
+    """
+    Extract speech features on the fly.
+    """
+
+    def __init__(self, args: Namespace):
+        super().__init__(args)
+        self.shift_size = args.shift_size
+        self.window_size = args.window_size
+        assert self.window_size >= self.shift_size
+
+        self.sample_rate = args.sample_rate
+        self.feature_dim = args.feature_dim
+        self.num_samples_per_shift = int(self.shift_size * self.sample_rate / 1000)
+        self.num_samples_per_window = int(self.window_size * self.sample_rate / 1000)
+        self.len_ms_to_samples = lambda x: x * self.sample_rate / 1000
+
+        self.convert_to_fbank = WaveformToFbankConverter(
+            num_mel_bins=80,
+            waveform_scale=2**15 if args.denormalize else 1.0,
+            standardize=False,
+            device=args.device,
+            dtype=args.dtype,
+        )
+
+    def build_states(self) -> FeatureStates:
+        return FeatureStates()
+
+    @staticmethod
+    def add_args(parser: ArgumentParser) -> None:
+        parser.add_argument(
+            "--shift-size",
+            type=int,
+            default=SHIFT_SIZE,
+            help="Shift size of feature extraction window.",
+        )
+        parser.add_argument(
+            "--window-size",
+            type=int,
+            default=WINDOW_SIZE,
+            help="Window size of feature extraction window.",
+        )
+        parser.add_argument(
+            "--feature-dim",
+            type=int,
+            default=FEATURE_DIM,
+            help="Acoustic feature dimension.",
+        )
+        parser.add_argument(
+            "--denormalize",
+            action="store_true",
+            help="denormalized to 16-bit signed integers",
+        )
+
+    def policy(self, states: FeatureStates) -> Action:
+        if len(states.source) == 0:
+            if states.source_finished:
+                return WriteAction({}, finished=states.source_finished)
+            else:
+                return ReadAction()
+
+        samples = states.source[-1]
+
+        samples = states.previous_residual_samples + samples
+        if len(samples) < self.num_samples_per_window:
+            states.previous_residual_samples = samples
+            return ReadAction()
+
+        # num_frames is the number of frames from the new segment
+        num_frames = math.floor(
+            (len(samples) - self.len_ms_to_samples(self.window_size - self.shift_size))
+            / self.num_samples_per_shift
+        )
+
+        # the number of frames used for feature extraction
+        # including some part of the previous segment
+        effective_num_samples = int(
+            num_frames * self.len_ms_to_samples(self.shift_size)
+            + self.len_ms_to_samples(self.window_size - self.shift_size)
+        )
+
+        input_samples = samples[:effective_num_samples]
+        states.previous_residual_samples = samples[
+            num_frames * self.num_samples_per_shift :
+        ]
+
+        data: WaveformToFbankInput = {
+            "waveform": torch.tensor(input_samples).unsqueeze(0),
+            "sample_rate": self.sample_rate,
+        }
+
+        output = self.convert_to_fbank(data)["fbank"]
+
+        return WriteAction(
+            SpeechSegment(
+                content=output,
+                tgt_lang=states.tgt_lang,
+                finished=states.source_finished,
+            ),
+            finished=states.source_finished,
+        )
+
+    @classmethod
+    def from_args(cls, args: Any, **kwargs: Any) -> OnlineFeatureExtractorAgent:
+        return cls(args)

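Note: a worked example of the frame arithmetic in policy() above, using the defaults (16 kHz audio, 10 ms shift = 160 samples, 25 ms window = 400 samples):

    import math

    SHIFT, WINDOW = 160, 400          # samples per 10 ms shift / 25 ms window
    samples = [0.0] * 480             # residual + newly read chunk

    num_frames = math.floor((len(samples) - (WINDOW - SHIFT)) / SHIFT)  # -> 1
    effective = num_frames * SHIFT + (WINDOW - SHIFT)                   # -> 400
    residual = samples[num_frames * SHIFT:]                             # 320 left

    # One 25 ms fbank frame is produced from samples[:400]; the trailing 320
    # samples carry over so the next window overlaps by WINDOW - SHIFT.
    assert (num_frames, effective, len(residual)) == (1, 400, 320)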
+ 169 - 0
src/seamless_communication/streaming/agents/unity_pipeline.py

@@ -0,0 +1,169 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import annotations
+from simuleval.agents.agent import GenericAgent
+
+import logging
+import torch
+
+from argparse import ArgumentParser, Namespace
+from typing import Any, List, Optional
+
+from fairseq2.assets import asset_store
+from seamless_communication.streaming.agents.mixins import EarlyStoppingMixin
+from seamless_communication.inference.translator import Modality, Translator
+from seamless_communication.models.unity import (
+    load_unity_config,
+    load_unity_model,
+    load_unity_text_tokenizer,
+    load_unity_unit_tokenizer,
+)
+from seamless_communication.models.monotonic_decoder import load_monotonic_decoder_model
+
+from simuleval.agents import AgentPipeline, AgentStates
+from simuleval.data.segments import Segment
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+def maybe_reset_states(states: Optional[List[Optional[AgentStates]]]) -> None:
+    for s in states:
+        if s is not None:
+            if isinstance(s, EarlyStoppingMixin):
+                s.reset_early()
+            else:
+                s.reset()
+
+
+class UnitYPipelineMixin:
+    """
+    Mixin for fairseq pipeline which works with both AgentPipeline
+    and TreeAgentPipeline
+    """
+
+    @classmethod
+    def add_args(cls, parser: ArgumentParser) -> None:
+        super().add_args(parser)
+        parser.add_argument("--task", type=str, help="Task type")
+        parser.add_argument(
+            "--unity-model-name",
+            type=str,
+            help="Unity model name.",
+            default="unity_sans_decoder",
+        )
+        parser.add_argument(
+            "--monotonic-decoder-model-name",
+            type=str,
+            help="Monotonic decoder model name.",
+            default="monotonic_decoder",
+        )
+        parser.add_argument(
+            "--sample-rate",
+            default=16000,
+            type=float,
+        )
+        parser.add_argument(
+            "--dtype",
+            default="fp16",
+            type=str,
+        )
+
+    @classmethod
+    def from_args(cls, args: Any) -> UnitYPipelineMixin:
+        return cls(args)
+
+
+class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline):
+    pipeline: List[GenericAgent] = []
+
+    def __init__(self, args: Namespace):
+
+        if not torch.cuda.is_available() and "cuda" in args.device:
+            raise ValueError("CUDA not available, use CPU.")
+
+        args.device = torch.device(args.device)
+        if (args.fp16 or args.dtype == "fp16") and args.device != torch.device("cpu"):
+            args.dtype = torch.float16
+        else:
+            args.dtype = torch.float32
+
+        input_modality, output_modality = Translator.get_modalities_from_task_str(
+            args.task
+        )
+
+        if input_modality != Modality.SPEECH:
+            raise ValueError("`UnitYAgentPipeline` only supports speech input.")
+
+        unity_config = load_unity_config(args.unity_model_name)
+        unity_config.use_text_decoder = False
+        unity_config.use_text_encoder = False
+
+        text_tokenizer = load_unity_text_tokenizer(args.unity_model_name)
+
+        # Skip loading the T2U model.
+        if output_modality == Modality.TEXT:
+            unity_config.t2u_config = None
+            unit_tokenizer = None
+        else:
+            unit_tokenizer = load_unity_unit_tokenizer(args.unity_model_name)
+
+        asset_card = asset_store.retrieve_card(args.unity_model_name)
+        asset_card.field("model_config").set(unity_config)
+
+        logger.info(
+            f"Loading the UnitY model: {args.unity_model_name} on device={args.device}, dtype={args.dtype}"
+        )
+        unity_model = load_unity_model(asset_card, device=args.device, dtype=args.dtype)
+        unity_model.eval()
+
+        logger.info(
+            f"Loading the Monotonic Decoder model: {args.monotonic_decoder_model_name} on device={args.device}, dtype={args.dtype}"
+        )
+        monotonic_decoder_model = load_monotonic_decoder_model(
+            args.monotonic_decoder_model_name, device=args.device, dtype=args.dtype
+        )
+        monotonic_decoder_model.eval()
+
+        module_list = []
+        for p in self.pipeline:
+            module_list.append(
+                p.from_args(
+                    args,
+                    unity_model=unity_model,
+                    unity_config=unity_config,
+                    monotonic_decoder_model=monotonic_decoder_model,
+                    text_tokenizer=text_tokenizer,
+                    unit_tokenizer=unit_tokenizer,
+                )
+            )
+
+        super().__init__(module_list)
+
+    def pop(self, states: Optional[List[Optional[AgentStates]]] = None) -> Segment:
+        output_segment = super().pop(states)
+        if states is None:
+            # Not stateless
+            first_states = self.module_list[0].states
+        else:
+            assert len(states) == len(self.module_list)
+            first_states = states[0]
+
+        if not first_states.source_finished and output_segment.finished:
+            # An early stop.
+            # The temporary solution is to start over
+            if states is not None:
+                maybe_reset_states(states)
+            else:
+                self.reset()
+            output_segment.finished = False
+
+        return output_segment

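Note: maybe_reset_states() dispatches on the mixin: EarlyStoppingMixin states get reset_early(), everything else gets a full reset(). pop() relies on this to recover from a premature finished flag without discarding buffered input. A hypothetical standalone sketch of the dispatch (the states classes are illustrative, not from this commit):

    from simuleval.agents.states import AgentStates
    from seamless_communication.streaming.agents.mixins import EarlyStoppingMixin
    from seamless_communication.streaming.agents.unity_pipeline import (
        maybe_reset_states,
    )

    class PlainStates(AgentStates):
        pass

    class EarlyStates(AgentStates, EarlyStoppingMixin):
        def reset_early(self) -> None:
            self.target = []          # keep source, drop the partial target

    maybe_reset_states([PlainStates(), EarlyStates(), None])
    # PlainStates.reset() ran, EarlyStates.reset_early() ran, None was skipped.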
+ 9 - 0
src/seamless_communication/streaming/dataloaders/__init__.py

@@ -0,0 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from seamless_communication.streaming.dataloaders.s2tt import (
+    SimulEvalSpeechToTextDataloader as SimulEvalSpeechToTextDataloader,
+)

+ 169 - 0
src/seamless_communication/streaming/dataloaders/s2tt.py

@@ -0,0 +1,169 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import subprocess
+from argparse import ArgumentParser, Namespace
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+import torch
+import torch.nn.functional as F
+from fairseq2.data.audio import AudioDecoder
+from fairseq2.data.data_pipeline import Collater, DataPipeline, FileMapper
+from fairseq2.data.text.converters import StrSplitter
+from fairseq2.data.text.text_reader import read_text
+from simuleval.data.dataloader import register_dataloader
+from simuleval.data.dataloader.dataloader import IterableDataloader
+from simuleval.data.dataloader.s2t_dataloader import SpeechToTextDataloader
+
+
+@dataclass
+class SoundFileInfo:
+    samplerate: float
+    path: str
+
+    def __repr__(self) -> str:
+        return "\n".join([f"samplerate: {str(self.samplerate)}", f"path: {self.path}"])
+
+
+def count_lines(filename: Path) -> int:
+    result = subprocess.run(["wc", "-l", filename], stdout=subprocess.PIPE)
+    return int(result.stdout.decode().split()[0]) - 1
+
+
+@register_dataloader("fairseq2_s2tt")
+class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader):
+    def __init__(self, data_pipeline: DataPipeline, args: Namespace) -> None:
+        self.args = args
+        self.data_file: Path = Path(getattr(self.args, "data_file", ""))
+        if not self.data_file.exists():
+            raise ValueError(f"data_file: {self.data_file} does not exist.")
+        self.start_index: int = getattr(self.args, "start_index", 0)
+        self.end_index: int = getattr(self.args, "end_index", -1)
+        self.data_pipeline = data_pipeline
+        self.data_itr = iter(self.data_pipeline)
+        self.cur_index = self.start_index - 1
+        self.item = None
+
+    def __iter__(self) -> SimulEvalSpeechToTextDataloader:
+        return self
+
+    def __next__(self) -> SimulEvalSpeechToTextDataloader:
+        if self.cur_index >= self.end_index - 1:
+            raise StopIteration
+        self.item = next(self.data_itr)
+        self.cur_index += 1
+        return self
+
+    def reset(self) -> None:
+        self.cur_index = 0
+        self.data_pipeline.reset()
+
+    def __len__(self) -> int:
+        if self.end_index > 0:
+            return self.end_index - self.start_index
+        self.end_index = count_lines(self.data_file)
+        return self.end_index - self.start_index
+
+    def get_source(self, index: Optional[int] = None) -> List[float]:
+        source: List[float] = (
+            self.item["audio"]["data"]["waveform"]["seqs"].squeeze().tolist()
+        )
+        return source
+
+    def get_target(self, index: Optional[int] = None) -> str:
+        return str(self.item[self.args.ref_field][0])
+
+    def get_tgt_lang(self, index: Optional[int] = None) -> Optional[str]:
+        if self.args.tgt_lang:
+            tgt_lang: str = self.args.tgt_lang
+            return tgt_lang
+
+        tgt_lang = self.item.get("tgt_lang")
+        return str(tgt_lang[0]) if tgt_lang else None
+
+    def get_source_audio_info(self, index: Optional[int] = None) -> SoundFileInfo:
+        samplerate = self.item["audio"]["data"]["sample_rate"][0]
+        path = f'{self.args.audio_root_dir}/{str(self.item["audio"]["path"][0])}'
+        return SoundFileInfo(samplerate, path)
+
+    def get_source_audio_path(self, index: Optional[int] = None) -> str:
+        return str(self.item["audio"]["path"][0])
+
+    @classmethod
+    def from_args(cls, args: Namespace) -> SimulEvalSpeechToTextDataloader:
+        with open(args.data_file, "r") as f:
+            header = f.readline().strip("\n").split("\t")
+
+        split_tsv = StrSplitter(names=header)
+
+        start_index: int = getattr(args, "start_index", 0)
+
+        pipeline_builder = (
+            read_text(args.data_file, rtrim=True).skip(1 + start_index).map(split_tsv)
+        )
+
+        map_file = FileMapper(root_dir=args.audio_root_dir, cached_fd_count=10)
+
+        pipeline_builder.map(map_file, selector="audio")
+
+        device = getattr(args, "device", None)
+        assert device is not None
+
+        decode_audio = AudioDecoder(dtype=torch.float32, device=torch.device(device))
+
+        pipeline_builder.map(
+            decode_audio,
+            selector="audio.data",
+        )
+
+        pipeline_builder.map(
+            lambda x: F.layer_norm(x, x.shape),
+            selector="audio.data.waveform",
+        )
+
+        collate = Collater(pad_value=0, pad_to_multiple=1)
+
+        pipeline_builder.map(collate)
+
+        pipeline_builder.prefetch(1)
+
+        data_pipeline = pipeline_builder.and_return()
+
+        return cls(data_pipeline, args)
+
+    @staticmethod
+    def add_args(parser: ArgumentParser) -> None:
+        parser.add_argument(
+            "--data-file",
+            type=str,
+            required=True,
+            help="Data file (.tsv) to be evaluated.",
+        )
+        parser.add_argument(
+            "--audio-root-dir",
+            type=str,
+            help="Root directory for the audio filenames in the data file.",
+            default="",
+        )
+        parser.add_argument(
+            "--ref-field",
+            type=str,
+            help="Reference target text field to compute the BLEU score against.",
+            default="tgt_text",
+        )
+        parser.add_argument(
+            "--source-segment-size",
+            type=int,
+            default=1,
+            help="Source segment size, For text the unit is # token, for speech is ms",
+        )
+        parser.add_argument(
+            "--tgt-lang", type=str, help="Target language to translate/transcribe into."
+        )