# Copyright (c) Meta Platforms, Inc. and affiliates
# All rights reserved.
#
# This source code is licensed under the license found in the
# MIT_LICENSE file in the root directory of this source tree.

from typing import Final

import torch
from torch import tensor

from fairseq2.data.audio import AudioDecoderOutput

from seamless_communication.models.aligner.alignment_extractor import (
    AlignmentExtractor,
)

from tests.common import assert_equal, device, get_default_dtype

REF_TEXT = "the examination and testimony of the experts enabled the commission to conclude that five shots may have been fired"
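
# Expected per-token alignment durations for REF_TEXT (the trailing silence
# token requested below is included). fp16 and fp32 inference produce slightly
# different results, so each precision has its own reference.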
# fmt: off
REF_DURATIONS_FP16: Final = [[ 1,  1,  2,  1,  1,  5,  5,  6,  4,  3,  2,  3,  4,  4,  2,  2,  2,  1,
                               1,  1,  3,  3,  3,  4,  3,  3,  3,  4,  4,  3,  2,  2,  1,  1,  1,  1,
                               2,  4,  6,  5,  4,  3,  4,  5,  5, 16,  6,  3,  5,  5,  3,  3,  1,  2,
                               1,  1,  1,  2,  3,  2,  3,  1,  3,  3,  3,  2,  2,  4,  2,  2,  2,  3,
                               2,  4,  5,  4,  5,  8,  3, 17,  2,  2,  3,  2,  5,  4,  6,  3,  1,  1,
                               4,  4,  3,  5,  3,  3,  2,  2,  2,  2,  2,  2,  2,  1,  2,  2,  1,  1,
                               2,  6,  4,  5,  9,  5,  1, 12]]
# fmt: on

# fmt: off
REF_DURATIONS_FP32: Final = [[ 1,  1,  2,  1,  1,  5,  5,  6,  4,  3,  2,  3,  4,  4,  2,  2,  2,  1,
                               1,  1,  3,  3,  3,  4,  3,  3,  4,  3,  4,  3,  2,  2,  1,  1,  1,  1,
                               2,  4,  6,  5,  4,  3,  4,  5,  5, 16,  6,  3,  5,  5,  3,  3,  1,  2,
                               1,  1,  1,  2,  3,  2,  3,  1,  3,  3,  3,  2,  2,  4,  2,  2,  2,  3,
                               2,  4,  5,  4,  5,  8,  3, 17,  2,  2,  3,  2,  5,  4,  6,  3,  1,  1,
                               4,  4,  3,  5,  3,  3,  2,  2,  2,  2,  2,  2,  2,  1,  2,  2,  1,  1,
                               2,  6,  4,  5,  9,  5,  1, 12]]
# fmt: on

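
# End-to-end check: take the decoded 16 kHz example utterance, extract
# text-to-unit alignment durations with the NAR T2U aligner, and compare them
# against the dtype-specific reference above.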
def test_aligner(example_rate16k_audio: AudioDecoderOutput) -> None:
    aligner_name = "nar_t2u_aligner"
    unit_extractor_name = "xlsr2_1b_v2"
    unit_extractor_output_layer_n = 35
    unit_extractor_kmeans_uri = "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy"

    dtype = get_default_dtype()

    if dtype == torch.float32:
        ref_tensor = REF_DURATIONS_FP32
    else:
        ref_tensor = REF_DURATIONS_FP16

    audio = example_rate16k_audio["waveform"].mean(
        1
    )  # average the channel dimension away to get the mono [time] shape the aligner expects

    extractor = AlignmentExtractor(
        aligner_name,
        unit_extractor_name,
        unit_extractor_output_layer_n,
        unit_extractor_kmeans_uri,
        device=device,
        dtype=dtype,
    )

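    # Only the extracted duration sequence is checked here; the remaining
    # outputs of extract_alignment are ignored.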
    alignment_durations, _, _ = extractor.extract_alignment(
        audio, REF_TEXT, plot=False, add_trailing_silence=True
    )

    assert_equal(
        alignment_durations, tensor(ref_tensor, device=device, dtype=torch.int64)
    )