2 years ago · 519f46576e
--- a/.gitignore
+++ b/.gitignore
@@ -143,4 +143,6 @@ outputs
 
															 # symlinks
														
 
															 seamless_communication
														
 
															-m4t_scripts
														
 
															+# ignore src/seamless_communication  
														
 
															+!*/seamless_communication
														
 
															+m4t_scripts
														
--- a/scripts/m4t/finetune/dataset.py
+++ b/scripts/m4t/finetune/dataset.py
@@ -12,9 +12,13 @@ import logging
 
															 import os
														
 
															 from pathlib import Path
														
 
															+import torch
														
 
															+
														
 
															 from seamless_communication.datasets.huggingface import (
														
 
															     Speech2SpeechFleursDatasetBuilder,
														
 
															+    SpeechTokenizer,
														
 
															 )
														
 
															+from seamless_communication.models.unit_extraction import UnitExtractor
														
 
															 logging.basicConfig(
														
 
															     level=logging.INFO,
														
@@ -91,6 +95,28 @@ def _check_lang_code_mapping(lang: str) -> None:
 
															         )
														
 
															+class UnitSpeechTokenizer(SpeechTokenizer):
														
 
															+    MODEL_NAME = "xlsr2_1b_v2"
														
 
															+    KMEANS_MODEL_URI = "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy"
														
 
															+    OUTPUT_LAYER_IDX = 34
														
 
															+
														
 
															+    def __init__(self, device: torch.device):
														
 
															+        super().__init__()
														
 
															+        self.device = device
														
 
															+        self.unit_extractor = UnitExtractor(
														
 
															+            model_name_or_card=self.MODEL_NAME,
														
 
															+            kmeans_uri=self.KMEANS_MODEL_URI,
														
 
															+            device=self.device,
														
 
															+        )
														
 
															+
														
 
															+    def encode(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
														
 
															+        return self.unit_extractor.predict(
														
 
															+            wav.to(self.device),
														
 
															+            out_layer_idx=self.OUTPUT_LAYER_IDX,
														
 
															+            sample_rate=sample_rate,
														
 
															+        )
														
 
															+
														
 
															+
														
 
															 def download_fleurs_dataset(
														
 
															     source_lang: str,
														
 
															     target_lang: str,
														
@@ -99,7 +125,10 @@ def download_fleurs_dataset(
 
															 ) -> str:
														
 
															     _check_lang_code_mapping(source_lang)
														
 
															     _check_lang_code_mapping(target_lang)
														
 
															-    tokenizer = None
														
 
															+    device = (
														
 
															+        torch.device("cuda:0") if torch.cuda.device_count() > 0 else torch.device("cpu")
														
 
															+    )
														
 
															+    tokenizer = UnitSpeechTokenizer(device=device)
														
 
															     dataset_iterator = Speech2SpeechFleursDatasetBuilder(
														
 
															         source_lang=UNITY_TO_FLEURS_LANG_MAPPING[source_lang],
														
 
															         target_lang=UNITY_TO_FLEURS_LANG_MAPPING[target_lang],
														
@@ -111,7 +140,7 @@ def download_fleurs_dataset(
 
															     )
														
 
															     manifest_path: str = os.path.join(save_directory, f"{split}_manifest.json")
														
 
															     with open(manifest_path, "w") as fp_out:
														
 
															-        for idx, sample in enumerate(dataset_iterator, start=1):
														
 
															+        for idx, sample in enumerate(dataset_iterator.__iter__(), start=1):
														
 
															             # correction as FleursDatasetBuilder return fleurs lang codes
														
 
															             sample.source.lang = source_lang
														
 
															             sample.target.lang = target_lang
														
--- a/src/seamless_communication/datasets/huggingface.py
+++ b/src/seamless_communication/datasets/huggingface.py
@@ -7,7 +7,8 @@
 
															 import logging
														
 
															 import os
														
 
															-from typing import Any, Dict, Iterable, Optional
														
 
															+from abc import abstractmethod
														
 
															+from typing import Dict, Iterable, Optional
														
 
															 import numpy as np
														
 
															 import torch
														
@@ -18,6 +19,12 @@ from .datatypes import LangPairSample, MultimodalSample
 
															 logger = logging.getLogger(__name__)
														
 
															+class SpeechTokenizer:
														
 
															+    @abstractmethod
														
 
															+    def encode(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
														
 
															+        ...
														
 
															+
														
 
															+
														
 
															 class Speech2SpeechFleursDatasetBuilder:
														
 
															     """Assembles speech2speech dataset from google/fleurs on HuggingFace"""
														
@@ -32,7 +39,7 @@ class Speech2SpeechFleursDatasetBuilder:
 
															         skip_target_audio: bool = True,
														
 
															         audio_dtype: torch.dtype = torch.float32,
														
 
															         dataset_cache_dir: Optional[str] = None,
														
 
															-        speech_tokenizer: Optional[Any] = None,
														
 
															+        speech_tokenizer: Optional[SpeechTokenizer] = None,
														
 
															     ):
														
 
															         self.source_lang = source_lang
														
 
															         self.target_lang = target_lang
														
@@ -65,7 +72,11 @@ class Speech2SpeechFleursDatasetBuilder:
 
															             waveform = None
														
 
															         if self.speech_tokenizer is not None and not should_skip_audio:
														
 
															             assert waveform is not None
														
 
															-            units = self.speech_tokenizer.encode(waveform.unsqueeze(0))[0].tolist()
														
 
															+            assert sampling_rate is not None
														
 
															+            units_tensor = self.speech_tokenizer.encode(
														
 
															+                waveform, sampling_rate
														
 
															+            ).reshape(-1)
														
 
															+            units = units_tensor.tolist()
														
 
															         else:
														
 
															             units = None
														
 
															         return MultimodalSample(