@@ -0,0 +1,325 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
+
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Tuple, Union
+
+from fairseq2.assets.card import AssetCard
+from fairseq2.data import Collater
+from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter
+from fairseq2.generation import (
+    BeamSearchSeq2SeqGenerator,
+    SequenceGeneratorOutput,
+)
+from fairseq2.memory import MemoryBlock
+from fairseq2.nn.transformer.multihead_attention import AttentionWeightHook
+from fairseq2.typing import DataType, Device
+
+import numpy as np
+from scipy.signal import medfilt2d
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from seamless_communication.models.unity import (
+    UnitYX2TModel,
+    load_unity_model,
+    load_unity_text_tokenizer,
+)
+
+
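+# Attention-weight hook registered on the last decoder layer's cross-attention.
+# It accumulates one head-summed attention row per generated token; Transcriber
+# later aligns these rows with encoder frames to recover word-level timestamps.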
+class EncDecAttentionsCollect(AttentionWeightHook):
+    def __init__(self):
+        super().__init__()
+        self.attn_scores = []
+
+    def __call__(self, m, attn, attn_weights) -> None:
+        if attn_weights.shape[-2] > 1:
+            # Several target positions in one forward pass (e.g. the prompt):
+            # keep one head-summed attention row per position.
+            val = torch.clone(attn_weights).detach().sum(dim=0).squeeze(0).tolist()
+            self.attn_scores.extend(val)
+        else:
+            # Single-position (incremental) decoding step: sum over heads, drop
+            # the singleton target dimension and store one row for this step.
+            val = (
+                torch.clone(attn_weights)
+                .detach()
+                .sum(dim=0)
+                .sum(dim=0)
+                .squeeze(0)
+                .tolist()
+            )
+            self.attn_scores.append(val)
+
+    def reset(self):
+        self.attn_scores = []
+
+
+class TranscriptionToken:
+    text: str
+    time_s: float
+    prob: float
+
+    def __init__(self, text: str, time_s: float, prob: float):
+        self.text = text
+        self.time_s = time_s
+        self.prob = prob
+
+
+class Transcription:
+    text: str
+    tokens: List[TranscriptionToken]
+
+    def __init__(self, tokens: List[TranscriptionToken]):
+        self.text = " ".join([t.text for t in tokens])
+        self.tokens = tokens
+
+    def __str__(self):
+        return self.text
+
+    def __repr__(self):
+        return self.text
+
+
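+# Transcriber wraps the UnitY speech encoder and text decoder as an X2T
+# (speech-to-text) model and augments decoding with attention-based word-level
+# timestamps.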
+class Transcriber(nn.Module):
+    def __init__(
+        self,
+        model_name_or_card: Union[str, AssetCard],
+        device: torch.device = torch.device("cpu"),
+        dtype: torch.dtype = torch.float32,
+        encoder_layers: int = 6,
+        decoder_layers: int = 3,
+        embed_dim: int = 512,
+        depthwise_conv_kernel_size: int = 31,
+    ):
+        super().__init__()
+        self.device = device
+        self.dtype = dtype
+        self.embed_dim = embed_dim
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.depthwise_conv_kernel_size = depthwise_conv_kernel_size
+        self.tokenizer = load_unity_text_tokenizer(model_name_or_card)
+        self.decoder_vocab_info = self.tokenizer.vocab_info
+        self.langs = self.tokenizer.langs
+
+        model = self.load_model_for_inference(
+            load_unity_model, model_name_or_card, device, dtype
+        )
+        self.s2t = UnitYX2TModel(
+            encoder_frontend=model.speech_encoder_frontend,
+            encoder=model.speech_encoder,
+            decoder_frontend=model.text_decoder_frontend,
+            decoder=model.text_decoder,
+            final_proj=model.final_proj,
+            target_vocab_info=self.decoder_vocab_info,
+        )
+        self.enc_dec_attn_collector = EncDecAttentionsCollect()
+        self.s2t.decoder.layers[-1].encoder_decoder_attn.register_attn_weight_hook(
+            self.enc_dec_attn_collector
+        )
+
+        self.decode_audio = AudioDecoder(dtype=torch.float32, device=device)
+        self.convert_to_fbank = WaveformToFbankConverter(
+            num_mel_bins=80,
+            waveform_scale=2**15,
+            channel_last=True,
+            standardize=True,
+            device=device,
+            dtype=dtype,
+        )
+        self.collate = Collater(
+            pad_value=self.tokenizer.vocab_info.pad_idx, pad_to_multiple=2
+        )
+
+    @staticmethod
+    def load_model_for_inference(
+        load_model_fn: Callable[..., nn.Module],
+        model_name_or_card: Union[str, AssetCard],
+        device: Device,
+        dtype: DataType,
+    ) -> nn.Module:
+        model = load_model_fn(model_name_or_card, device=device, dtype=dtype)
+        model.eval()
+        return model
+
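+    # Longest increasing subsequence (standard O(n^2) dynamic programme), returning
+    # the subsequence length and the subsequence itself. _extract_timestamps uses it
+    # to keep a monotonic subset of the per-frame attention maxima, i.e. an alignment
+    # where time never moves backwards.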
+    @staticmethod
+    def generate_lis(arr: List[Tuple[int, int]]) -> Tuple[int, List[Tuple[int, int]]]:
+        n = len(arr)
+        lis = [1] * n
+        prev = [0] * n
+        for i in range(0, n):
+            prev[i] = i
+        for i in range(1, n):
+            for j in range(0, i):
+                if arr[i] > arr[j] and lis[i] < lis[j] + 1:
+                    lis[i] = lis[j] + 1
+                    prev[i] = j
+        maximum = 0
+        idx = 0
+        for i in range(n):
+            if maximum < lis[i]:
+                maximum = lis[i]
+                idx = i
+        seq = [arr[idx]]
+        while idx != prev[idx]:
+            idx = prev[idx]
+            seq.append(arr[idx])
+        # seq was built back-to-front; reverse it so the subsequence is in order
+        # and matches the declared List return type.
+        return (maximum, list(reversed(seq)))
+
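+    # Timestamp extraction: column-normalize the collected attention matrix, smooth
+    # it with a 2-D median filter, take the most-attended output token for every
+    # encoder frame, and keep a monotonic (LIS) subset of those maxima. The first
+    # frame kept for a token, scaled by seconds-per-frame, becomes its start time.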
+    @classmethod
+    def _extract_timestamps(
+        cls,
+        attn_weights,
+        audio_len,
+        filter_width,
+    ) -> List[float]:
+        # Drop the first attention row and the edge encoder positions before aligning.
+        attn_weights = [attn_line[1:-1] for attn_line in attn_weights][1:]
+
+        num_out_tokens = len(attn_weights)
+        num_encoder_steps = len(attn_weights[0])
+        attn_weights = np.array(attn_weights)
+        attn_weights = attn_weights / attn_weights.sum(axis=0, keepdims=True)  # normalize
+        attn_weights = medfilt2d(
+            attn_weights, kernel_size=(filter_width, filter_width)
+        )
+
+        # find timestamps using longest increasing subsequence algo
+        col_maxes = np.argmax(attn_weights, axis=0)
+        lis_input = [
+            (out_tok_idx, -enc_bin_idx)
+            for enc_bin_idx, out_tok_idx in enumerate(col_maxes)
+        ]
+        tok_idx_to_start_enc_bin_idx = {
+            out_tok_idx: -enc_bin_idx
+            for out_tok_idx, enc_bin_idx in cls.generate_lis(lis_input)[1]
+        }
+        # Tokens without a frame of their own inherit the previous token's start.
+        prev_start = 0
+        starts = []
+        for tok_idx in range(num_out_tokens):
+            start_enc_bin_idx = tok_idx_to_start_enc_bin_idx.get(tok_idx, prev_start)
+            starts.append(start_enc_bin_idx)
+            prev_start = start_enc_bin_idx
+        seconds_per_enc_pos = audio_len / num_encoder_steps
+        start_times = [seconds_per_enc_pos * start_pos for start_pos in starts]
+        return start_times
+
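+    # Merge SentencePiece pieces into words: a piece starting with "▁" that maps to
+    # a later timestamp opens a new word; a word's probability is the mean of
+    # exp(step score) over its pieces.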
+    @classmethod
+    def _collect_word_level_stats(
+        cls, pieces: List[str], token_timestamps: List[float], step_scores: List[float]
+    ) -> List[TranscriptionToken]:
+        assert len(pieces) == len(token_timestamps) and len(token_timestamps) == len(
+            step_scores
+        )
+        word_stats: List[List[Any]] = []
+        for (
+            time_s,
+            token,
+            score,
+        ) in zip(token_timestamps, pieces, step_scores):
+            if not word_stats or (
+                token.startswith("▁") and time_s > word_stats[-1][1]
+            ):
+                word_stats.append(
+                    [token.replace("▁", " ").strip(), time_s, [np.exp(score)]]
+                )
+            else:
+                word_stats[-1][0] += token.replace("▁", " ")
+                word_stats[-1][2].append(np.exp(score))
+        word_stats = [
+            TranscriptionToken(word, start, np.mean(probs))
+            for word, start, probs in word_stats
+        ]
+        return word_stats
+
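+    # Single-utterance generation: build the language-specific target prefix, run
+    # beam search over the fbank features, then combine the generated pieces, their
+    # step scores and the collected cross-attention into a word-level Transcription.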
+    def run_inference(
+        self,
+        fbanks: torch.Tensor,
+        src_lang: str,
+        length_seconds: float,
+        filter_width: int,
+        gen_opts: Dict,
+    ) -> Transcription:
+        prefix = self.tokenizer.create_encoder(
+            mode="target", lang=src_lang
+        ).prefix_indices
+        beam_size = gen_opts.get("beam_size") or 1  # set to 1 by default
+        gen_opts.pop("beam_size", None)
+        generator = BeamSearchSeq2SeqGenerator(
+            model=self.s2t,
+            beam_size=beam_size,
+            **gen_opts,
+        )
+
+        # Clear attention rows left over from any previous utterance before generating.
+        self.enc_dec_attn_collector.reset()
+        output: SequenceGeneratorOutput = generator(
+            source_seqs=fbanks.unsqueeze(0),
+            source_padding_mask=None,
+            prompt_seqs=prefix.unsqueeze(0),
+            prompt_padding_mask=None,
+        )
+
+        # Drop the trailing EOS entry from the tokens, scores and attention rows.
+        token_ids = output.hypotheses[0][0].seq.squeeze(0).tolist()[:-1]
+        step_scores = output.hypotheses[0][0].step_scores.tolist()[:-1]
+        enc_dec_attn_scores = self.enc_dec_attn_collector.attn_scores[:-1]
+        token_timestamps = self._extract_timestamps(
+            enc_dec_attn_scores,
+            length_seconds,
+            filter_width,
+        )
+        pieces = [
+            self.tokenizer.model.index_to_token(token_id) for token_id in token_ids
+        ]
+        stats = self._collect_word_level_stats(
+            pieces=pieces,
+            token_timestamps=token_timestamps,
+            step_scores=step_scores,
+        )
+        return Transcription(stats)
+
+    @torch.inference_mode()
+    def transcribe(
+        self,
+        audio: Union[str, Tensor],
+        src_lang: str,
+        filter_width: int = 3,
+        sample_rate: int = 16000,
+        **sequence_generator_options: Any,
+    ) -> Transcription:
+        """
+        Transcribe audio into text with word-level timestamps.
+
+        :param audio:
+            Either a path to an audio file or an audio Tensor.
+        :param src_lang:
+            Source language of the audio.
+        :param filter_width:
+            Kernel size of the median filter applied to the attention weights
+            when extracting timestamps.
+        :param sample_rate:
+            Sample rate of the audio Tensor; ignored when a path is given.
+        :param sequence_generator_options:
+            Keyword arguments forwarded to BeamSearchSeq2SeqGenerator.
+
+        :returns:
+            A Transcription holding the full text and word-level tokens with
+            start times and probabilities.
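+
+        Example (illustrative sketch; the model card name and audio path below
+        are placeholders)::
+
+            transcriber = Transcriber("seamlessM4T_medium")
+            transcription = transcriber.transcribe("audio.wav", src_lang="eng")
+            for token in transcription.tokens:
+                print(f"{token.time_s:.2f}s {token.prob:.2f} {token.text}")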
+        """
+        if isinstance(audio, str):
+            with Path(audio).open("rb") as fb:
+                block = MemoryBlock(fb.read())
+            decoded_audio = self.decode_audio(block)
+        else:
+            decoded_audio = {
+                "waveform": audio,
+                "sample_rate": sample_rate,
+                "format": -1,
+            }
+
+        src = self.convert_to_fbank(decoded_audio)["fbank"]
+
+        length_seconds = (
+            decoded_audio["waveform"].size(0) / decoded_audio["sample_rate"]
+        )
+
+        return self.run_inference(
+            src,
+            src_lang,
+            length_seconds,
+            filter_width,
+            sequence_generator_options,
+        )