Enable M4T vocoder inference on the GPU in fp16. (#151)

* Enable M4T vocoder inference on the GPU; remove unnecessary device-to-host syncs in the Translator.

* Address review comment; disable the non-deterministic GPU integration test.
Kaushik Ram Sadagopan
commit fb59ee0a49
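
With this commit the vocoder is loaded in the same dtype as the rest of the pipeline instead of being pinned to torch.float32, so the whole speech path can run in half precision on the GPU. A minimal sketch of the intended usage, assuming a CUDA device is available (the constructor arguments mirror the integration tests below):

    import torch
    from fairseq2.typing import Device
    from seamless_communication.inference import Translator

    # The vocoder now inherits `dtype` from the Translator, so the text,
    # unit, and vocoder models are all loaded in fp16 on the GPU.
    translator = Translator(
        "seamlessM4T_v2_large",
        "vocoder_v2",
        Device("cuda:0"),
        dtype=torch.float16,
    )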

+ 12 - 12
src/seamless_communication/inference/translator.py

@@ -151,7 +151,7 @@ class Translator(nn.Module):
             output_modality is None or output_modality == Modality.SPEECH
         ):
             self.vocoder = load_vocoder_model(
-                vocoder_name_or_card, device=device, dtype=torch.float32
+                vocoder_name_or_card, device=device, dtype=dtype
             )
             self.vocoder.eval()

@@ -232,7 +232,7 @@ class Translator(nn.Module):
         unit_generation_ngram_filtering: bool = False,
         duration_factor: float = 1.0,
         prosody_encoder_input: Optional[SequenceData] = None,
-        src_text: Optional[str] = None
+        src_text: Optional[str] = None,
     ) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]:
         """
         The main method used to perform inference on all tasks.
@@ -259,15 +259,17 @@ class Translator(nn.Module):
             Optional source transcript (obtained by ASR for instance). This is used for
             applying mintox toxicity mitigation. If this is not specify and apply_mintox=True
             then src_lang must be specified and ASR will be run on the audio source.
-
+
         :returns:
             - Batched list of Translated text.
             - Translated BatchedSpeechOutput.
         """
         input_modality, output_modality = self.get_modalities_from_task_str(task_str)

-        if self.apply_mintox and not (src_lang is not None or src_text is not None) :
-            raise ValueError("`src_lang` must be specified when `apply_mintox` is `True` or you need to specify src_text.")
+        if self.apply_mintox and not (src_lang is not None or src_text is not None):
+            raise ValueError(
+                "`src_lang` must be specified when `apply_mintox` is `True` or you need to specify src_text."
+            )

         if isinstance(input, dict):
             src = cast(SequenceData, input)
@@ -384,20 +386,18 @@

             audio_wavs = []
             speech_units = []
-            for i in range(len(unit_output.units)):
-                u = units[i].cpu().numpy().tolist()
-                index_of_first_one = next(
-                    (index for index, value in enumerate(u) if value == 1), len(u)
+            for i in range(len(units)):
+                padding_mask = (
+                    units[i] != self.model.t2u_model.target_vocab_info.pad_idx
                 )
-                u = u[:index_of_first_one]
-                speech_units.append(u)
+                u = units[i][padding_mask]
                 if self.vocoder is not None:
                     # TODO: Implement batched inference for vocoder.
                     translated_audio_wav = self.vocoder(
                         u, tgt_lang, spkr, dur_prediction=duration_prediction
                     )
                     audio_wavs.append(translated_audio_wav)
-
+                speech_units.append(u.tolist())
             return (
                 text_output.sentences,
                 BatchedSpeechOutput(
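
The loop above is the core device-to-host sync removal: instead of copying every unit sequence to the host and scanning Python-side for the first pad unit, the new code masks padding on the device and defers the single .tolist() call to the end. A standalone sketch of the two approaches; pad_idx is an illustrative value (the real code reads self.model.t2u_model.target_vocab_info.pad_idx), and the equivalence assumes padding only occurs as a trailing run:

    import torch

    pad_idx = 1  # illustrative value only
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    units = torch.tensor([17, 99, 42, pad_idx, pad_idx], device=device)

    # Before: a device-to-host copy per sequence, then a Python-level scan.
    u_list = units.cpu().tolist()
    first_pad = next((i for i, v in enumerate(u_list) if v == pad_idx), len(u_list))
    old_u = u_list[:first_pad]

    # After: boolean masking stays on the device; no host transfer is needed
    # until the trimmed units are serialized for the output container.
    new_u = units[units != pad_idx]

    assert new_u.tolist() == old_u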

+ 5 - 10
src/seamless_communication/models/unit_extractor/unit_extractor.py

@@ -5,9 +5,8 @@
 # LICENSE file in the root directory of this source tree.

 import logging
-from itertools import groupby
 from pathlib import Path
-from typing import List, Union
+from typing import Union

 import torch
 import torch.nn.functional as F
@@ -103,15 +102,11 @@ class UnitExtractor(nn.Module):
         units: Tensor,
         src_lang: str,
         device: Device,
-        vocoder_name: str = "vocoder_36langs",
+        dtype: DataType,
+        vocoder_name: str = "vocoder_v2",
     ) -> Tensor:
-        def reduce_list(lst: List[Tensor]) -> List[Tensor]:
-            return [key for key, _ in groupby(lst)]
-
-        reduced_units = reduce_list(units.cpu().tolist())
-
-        vocoder = load_vocoder_model(vocoder_name, device=device, dtype=torch.float32)
+        vocoder = load_vocoder_model(vocoder_name, device=device, dtype=dtype)
         vocoder.eval()
         assert isinstance(vocoder, Vocoder)
-        wav = vocoder(reduced_units, src_lang, spkr=-1, dur_prediction=True)
+        wav = vocoder(units, src_lang, spkr=-1, dur_prediction=True)
         return wav  # type: ignore[no-any-return]
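
resynthesize_audio now takes the unit tensor and a dtype directly, dropping both the groupby-based unit reduction and the CPU round-trip. A sketch of the new call shape, treating it as the static helper its signature suggests and using dummy units in place of real UnitExtractor.predict() output:

    import torch
    from fairseq2.typing import Device
    from seamless_communication.models.unit_extractor import UnitExtractor

    device = Device("cuda:0") if torch.cuda.is_available() else Device("cpu")
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    # Dummy units for illustration only.
    units = torch.randint(0, 10000, (50,), device=device)

    wav = UnitExtractor.resynthesize_audio(units, src_lang="eng", device=device, dtype=dtype)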

+ 5 - 6
src/seamless_communication/models/vocoder/builder.py

@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 from fairseq2.models.utils.arch_registry import ArchitectureRegistry
 from fairseq2.typing import DataType, Device
@@ -32,7 +32,7 @@ class VocoderConfig:
     num_langs: int
     spkr_embedding_dim: int
     num_spkrs: int
-    lang_spkr_idx_map: Dict
+    lang_spkr_idx_map: Dict[str, Any]


 vocoder_archs = ArchitectureRegistry[VocoderConfig]("vocoder_code_hifigan")
@@ -93,7 +93,6 @@ class VocoderBuilder:
             The data type of module parameters and buffers.
         """
         self.config = config
-
         self.device, self.dtype = device, dtype

     def build_model(self) -> Vocoder:
@@ -114,8 +113,8 @@ class VocoderBuilder:
             self.config.spkr_embedding_dim,
             self.config.num_spkrs,
         )
+        code_generator.to(device=self.device, dtype=self.dtype)
         vocoder = Vocoder(code_generator, self.config.lang_spkr_idx_map)
-        vocoder.to(dtype=self.dtype)
         return vocoder


@@ -163,7 +162,7 @@


 @mel_vocoder_arch("24khz_mel")
-def _base_mel_vocoder() -> VocoderConfig:
+def _24khz_mel_vocoder() -> VocoderConfig:
     return VocoderConfig(
         upsample_rates=[5, 4, 4, 3],
         upsample_kernel_sizes=[10, 8, 8, 6],
@@ -206,7 +205,7 @@ class MelVocoderBuilder:
             self.config.resblock_dilation_sizes,
             self.config.model_in_dim,
         )
-        generator.to(dtype=self.dtype, device=self.device)
+        generator.to(device=self.device, dtype=self.dtype)
         return generator



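The build_model change above places the CodeGenerator's weights on the target device and dtype in one call at build time; the old code cast only the dtype, leaving device placement implicit. A generic illustration of why the combined .to() matters:

    import torch
    from torch.nn import Linear

    m = Linear(4, 4)

    # dtype-only cast: parameters are fp16 but still on the CPU, so GPU
    # inputs would mismatch or force copies on every forward call.
    m.to(dtype=torch.float16)
    print(m.weight.device, m.weight.dtype)  # cpu torch.float16

    # One call that both places and casts the parameters.
    if torch.cuda.is_available():
        m.to(device=torch.device("cuda"), dtype=torch.float16)
        print(m.weight.device, m.weight.dtype)  # cuda:0 torch.float16
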
+ 3 - 3
src/seamless_communication/models/vocoder/codehifigan.py

@@ -73,7 +73,7 @@ class CodeGenerator(Generator):
         return signal

     def forward(self, sample: Dict[str, Any], dur_prediction: bool) -> Tensor:  # type: ignore
-        x = sample["code"].clone().to(device=self.dict.weight.device)
+        x = sample["code"]
         x = self.dict(x).transpose(1, 2)

         if self.dur_predictor and dur_prediction:
@@ -85,11 +85,11 @@ class CodeGenerator(Generator):
             # B x C x T
             x = torch.repeat_interleave(x, dur_out.view(-1), dim=2)

-        spkr = self.spkr(sample["spkr"].to(self.spkr.weight.device)).transpose(1, 2)
+        spkr = self.spkr(sample["spkr"]).transpose(1, 2)
         spkr = self._upsample(spkr, x.shape[-1])
         x = torch.cat([x, spkr], dim=1)

-        lang = self.lang(sample["lang"].to(self.lang.weight.device)).transpose(1, 2)
+        lang = self.lang(sample["lang"]).transpose(1, 2)
         lang = self._upsample(lang, x.shape[-1])
         x = torch.cat([lang, x], dim=1)


+ 18 - 10
src/seamless_communication/models/vocoder/hifigan.py

@@ -6,15 +6,23 @@

 from typing import List, Optional

+import logging
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
+
 from torch import Tensor
-from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn import Conv1d, ConvTranspose1d, Module, ModuleList
 from torch.nn.utils.weight_norm import remove_weight_norm, weight_norm

 LRELU_SLOPE = 0.1

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s -- %(name)s: %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+

 def init_weights(m, mean: float = 0.0, std: float = 0.01) -> None:  # type: ignore
     classname = m.__class__.__name__
@@ -26,12 +34,12 @@ def get_padding(kernel_size: int, dilation: int = 1) -> int:
     return (kernel_size * dilation - dilation) // 2


-class ResBlock(torch.nn.Module):
+class ResBlock(Module):
     def __init__(
         self, channels: int, kernel_size: int = 3, dilation: List[int] = [1, 3, 5]
     ):
         super(ResBlock, self).__init__()
-        self.convs1 = nn.ModuleList(
+        self.convs1 = ModuleList(
             [
                 weight_norm(
                     Conv1d(
@@ -67,7 +75,7 @@ class ResBlock(torch.nn.Module):
         )
         self.convs1.apply(init_weights)

-        self.convs2 = nn.ModuleList(
+        self.convs2 = ModuleList(
             [
                 weight_norm(
                     Conv1d(
@@ -119,7 +127,7 @@ class ResBlock(torch.nn.Module):
             remove_weight_norm(layer)


-class Generator(torch.nn.Module):
+class Generator(Module):
     def __init__(
         self,
         upsample_rates: List[int],
@@ -130,7 +138,7 @@ class Generator(torch.nn.Module):
         model_in_dim: Optional[int],
         add_ups_out_pad: bool = False,
     ):
-        super(Generator, self).__init__()
+        super().__init__()
         self.num_kernels = len(resblock_kernel_sizes)
         self.num_upsamples = len(upsample_rates)
         self.conv_pre = weight_norm(
@@ -143,7 +151,7 @@ class Generator(torch.nn.Module):
             )
         )

-        self.ups = nn.ModuleList()
+        self.ups = ModuleList()
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             out_pad = u % 2 if add_ups_out_pad else 0
             self.ups.append(
@@ -159,7 +167,7 @@ class Generator(torch.nn.Module):
                 )
             )

-        self.resblocks = nn.ModuleList()
+        self.resblocks = ModuleList()
         for i in range(len(self.ups)):
             ch = upsample_initial_channel // (2 ** (i + 1))
             for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
@@ -188,7 +196,7 @@ class Generator(torch.nn.Module):
         return x

     def remove_weight_norm(self) -> None:
-        print("Removing weight norm...")
+        logger.info("Removing weight norm in Generator.")
         for layer in self.ups:
             remove_weight_norm(layer)
         for layer in self.resblocks:

+ 1 - 1
src/seamless_communication/models/vocoder/loader.py

@@ -34,7 +34,7 @@ def convert_vocoder_checkpoint(
     for key in old_state_dict:
         new_key = f"code_generator.{key}"
         new_state_dict[new_key] = old_state_dict[key]
-    checkpoint["model"] = new_state_dict
+    checkpoint["model"] = new_state_dict  # type: ignore
     del checkpoint["generator"]  # type: ignore
     return checkpoint


+ 16 - 12
src/seamless_communication/models/vocoder/vocoder.py

@@ -4,36 +4,40 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Dict, List, Optional
+from typing import Any, Dict, Optional

 import torch
-import torch.nn as nn
 from torch import Tensor
+from torch.nn import Module

 from seamless_communication.models.vocoder.codehifigan import CodeGenerator


-class Vocoder(nn.Module):
-    def __init__(self, code_generator: CodeGenerator, lang_spkr_idx_map: Dict):
-        super(Vocoder, self).__init__()
+class Vocoder(Module):
+    def __init__(
+        self,
+        code_generator: CodeGenerator,
+        lang_spkr_idx_map: Dict[str, Any],
+    ):
+        super().__init__()
         self.code_generator = code_generator
         self.lang_spkr_idx_map = lang_spkr_idx_map

     def forward(
         self,
-        code: List[int],
+        units: Tensor,
         lang: str,
         spkr: Optional[int] = -1,
         dur_prediction: bool = True,
     ) -> Tensor:
-        x = {
-            "code": torch.LongTensor(code).view(1, -1),
-        }
         lang_idx = self.lang_spkr_idx_map["multilingual"][lang]
         spkr_list = self.lang_spkr_idx_map["multispkr"][lang]
         if not spkr:
             spkr = -1
         spkr = spkr_list[0] if spkr == -1 else spkr
-        x["spkr"] = torch.tensor([[spkr]])
-        x["lang"] = torch.tensor([[lang_idx]])
-        return self.code_generator(x, dur_prediction)
+        x = {
+            "code": units.view(1, -1),
+            "spkr": torch.tensor([[spkr]], device=units.device),
+            "lang": torch.tensor([[lang_idx]], device=units.device),
+        }
+        return self.code_generator(x, dur_prediction)  # type: ignore[no-any-return]
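
Vocoder.forward now takes the units as a tensor and builds the speaker and language ID tensors on the units' device, so callers no longer pass a host-side List[int]. A sketch of the new call shape, assuming load_vocoder_model is importable from the vocoder package as it is used in unit_extractor.py above, with dummy units:

    import torch
    from seamless_communication.models.vocoder import load_vocoder_model

    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    vocoder = load_vocoder_model("vocoder_v2", device=device, dtype=dtype)
    vocoder.eval()

    units = torch.randint(0, 10000, (64,), device=device)  # dummy units
    with torch.inference_mode():
        wav = vocoder(units, "eng", spkr=-1, dur_prediction=True)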

+ 4 - 5
src/seamless_communication/streaming/agents/online_vocoder.py

@@ -37,15 +37,14 @@ class VocoderAgent(TextToSpeechAgent):  # type: ignore
                 return ReadAction()

         tgt_lang = states.tgt_lang if states.tgt_lang else self.tgt_lang
-        u = units[0][0].tolist()
-        wav_samples = self.vocoder(u, tgt_lang, self.speaker_id, dur_prediction=False)[
-            0
-        ][0].tolist()
+        u = units[0][0]
+
+        wav = self.vocoder(u, tgt_lang, self.speaker_id, dur_prediction=False)
         states.source = []

         return WriteAction(
             SpeechSegment(
-                content=wav_samples,
+                content=wav[0][0].tolist(),
                 finished=states.source_finished,
                 sample_rate=self.sample_rate,
                 tgt_lang=tgt_lang,

+ 0 - 3
tests/integration/inference/test_translator.py

@@ -6,9 +6,6 @@

 from typing import Final

-import torch
-from fairseq2.typing import Device
-
 from seamless_communication.inference import Translator
 from tests.common import device, get_default_dtype


+ 1 - 1
tests/integration/models/test_pretssel_vocoder.py

@@ -27,7 +27,7 @@ def test_pretssel_vocoder(example_rate16k_audio: AudioDecoderOutput) -> None:

     feat = convert_to_collated_fbank(audio_dict, dtype=dtype)["seqs"][0]

-    vocoder = load_mel_vocoder_model("vocoder_mel", device=device, dtype=torch.float32)
+    vocoder = load_mel_vocoder_model("vocoder_mel", device=device, dtype=dtype)
     vocoder.eval()

     with torch.inference_mode():

+ 5 - 3
tests/integration/models/test_unit_extractor.py

@@ -7,12 +7,12 @@
 from typing import Final

 import torch
-from fairseq2.typing import Device
 from torch import tensor

+from fairseq2.typing import Device
 from seamless_communication.inference import Translator
 from seamless_communication.models.unit_extractor import UnitExtractor
-from tests.common import assert_equal, device, get_default_dtype
+from tests.common import assert_equal

 # fmt: off
 REF_ENG_UNITS: Final = [8976, 8299, 0, 0, 9692, 5395, 785, 785, 7805, 6193, 2922, 4806, 3362, 3560, 8119, 8119, 4335, 205, 5424, 5424, 5064, 7421, 6547, 9952, 3728, 8544, 3321, 1093, 1443, 7962, 3978, 8063, 5168, 5491, 9133, 9275, 5912, 8729, 5097, 5495, 1650, 5048, 2839, 6756, 5665, 4191, 5205, 5205, 9568, 9568, 5932, 1190, 9339, 5839, 5839, 6244, 5320, 3454, 5216, 721, 6994, 6513, 7754, 3469, 296, 1849, 3254, 3254, 5042, 5042, 3961, 2079, 1907, 1846, 661, 2225, 944, 9295, 4712, 1785, 6060, 8701, 7646, 1355, 2876, 8199, 5901, 8199, 3861, 5153, 6420, 2897, 1389, 334, 6334]
@@ -23,7 +23,9 @@ def test_unit_extractor() -> None:
     model_name = "seamlessM4T_v2_large"
     english_text = "Hello! I hope you're all doing well."

-    dtype = get_default_dtype()
+    # We can't test on the GPU since the output is non-deterministic.
+    device = Device("cpu")
+    dtype = torch.float32

     translator = Translator(model_name, "vocoder_v2", device, dtype=dtype)
     translator = Translator(model_name, "vocoder_v2", device, dtype=dtype)