# large_M4T_v1.yaml (2.7 KB)
  # Evaluation data-loading settings.
  # NOTE(review): the nesting in this section was reconstructed from a
  # flattened paste (the pasted lines carried viewer line numbers and no
  # indentation); confirm key placement against the consumer's schema.
  eval_data:
    # Audio / filterbank feature settings.
    audio:
      audio_root_dir: /fsx-ust/data/audio_zips/
      fbanks_num_mel_bins: 80
      fbanks_standardize_audio: true
      # 32768 == 2**15.
      fbanks_waveform_scale: 32768
    # NOTE(review): presumably a sibling of `audio` rather than one of its
    # children (it is spelled `fbank_`, not `fbanks_`) - TODO confirm.
    fbank_feats_pad_idx: 0
    # Manifests are named inline as one comma-separated string (no spaces):
    # 14 xxx-eng pairs followed by the 14 reverse eng-xxx pairs.
    # `manifest_list_path` is therefore left null in this section.
    manifest_list: dev_fleurs_arb-eng,dev_fleurs_ben-eng,dev_fleurs_hin-eng,dev_fleurs_ind-eng,dev_fleurs_ita-eng,dev_fleurs_jpn-eng,dev_fleurs_por-eng,dev_fleurs_rus-eng,dev_fleurs_swh-eng,dev_fleurs_tha-eng,dev_fleurs_tur-eng,dev_fleurs_urd-eng,dev_fleurs_vie-eng,dev_fleurs_spa-eng,dev_fleurs_eng-arb,dev_fleurs_eng-ben,dev_fleurs_eng-hin,dev_fleurs_eng-ind,dev_fleurs_eng-ita,dev_fleurs_eng-jpn,dev_fleurs_eng-por,dev_fleurs_eng-rus,dev_fleurs_eng-swh,dev_fleurs_eng-tha,dev_fleurs_eng-tur,dev_fleurs_eng-urd,dev_fleurs_eng-vie,dev_fleurs_eng-spa
    manifest_list_path: null
    manifest_path_prefix: /fsx-ust/spopuri/datasets/S2ST/V1/M4T_V1_phase2/primary/
    max_seconds_per_input_audio: 150
    # A fixed batch size is set here and the per-batch token budget is null.
    fixed_batch_size: 40
    max_tgt_text_tokens_per_batch: null
    max_tgt_text_tokens_per_sample: 3000
    max_units_per_sample: 1500
    num_threads: 10
    # NOTE(review): the key is spelled `prefech` (sic). It is kept unchanged
    # because the consumer presumably reads it under this exact name - verify
    # before renaming.
    prefech_batches: 10
    prepend_tgt_lang_tag: true
    shuffle_window: 1000
    text_tokenization:
      from_model: seamlessM4T_large
      spm_path: null
      langtoks: null
    unit_tokenization:
      from_model: seamlessM4T_large
      langtoks: null
      num_units: null
    # NOTE(review): placed at the `eval_data` level; it could instead belong
    # under `unit_tokenization` - TODO confirm.
    unit_tokenizer_name: seamlessM4T_large
  # Model construction settings: custom parameters plus paths to pretrained
  # checkpoints.
  # NOTE(review): the nesting in this section was reconstructed from a
  # flattened paste; confirm key placement against the consumer's schema.
  model:
    # `custom_params` has no inline value, so it must be a mapping; only
    # `nllb_vocabulary_size` is assumed to be its child.
    custom_params:
      nllb_vocabulary_size: 256103
    # NOTE(review): `from_model` / `from_model_config` are presumably siblings
    # of `custom_params` rather than its children - TODO confirm.
    from_model: null
    from_model_config: null
    pretrained_s2t_decoder_path: /fsx-ust/spopuri/datasets/PT_CKPT/S2T/S2T_M4T_V1_V1_cleaned.pt
    pretrained_t2u_path: /fsx-ust/spopuri/datasets/PT_CKPT/T2U/V5_10K_p2_14_80K.pt
    pretrained_w2v2_path: /fsx-ust/spopuri/datasets/PT_CKPT/w2v2/w2vbert2rpq_600m_al5.pt
  # Training data-loading settings.
  # NOTE(review): the nesting in this section was reconstructed from a
  # flattened paste (the pasted lines carried viewer line numbers and no
  # indentation); confirm key placement against the consumer's schema.
  train_data:
    # Audio / filterbank feature settings.
    audio:
      audio_root_dir: /fsx-ust/data/audio_zips/
      fbanks_num_mel_bins: 80
      fbanks_standardize_audio: true
      # 32768 == 2**15.
      fbanks_waveform_scale: 32768
    # NOTE(review): presumably a sibling of `audio` rather than one of its
    # children (it is spelled `fbank_`, not `fbanks_`) - TODO confirm.
    fbank_feats_pad_idx: 0
    # No inline manifest list here; a manifest list file path is given instead.
    manifest_list: null
    manifest_list_path: /data/home/mavlyutov/train_configs/m4t_v1_train_manifests.txt
    # NOTE(review): no trailing slash here, whereas the `eval_data` prefix ends
    # in `/`. Whether this matters depends on how the consumer joins the prefix
    # with manifest names - verify.
    manifest_path_prefix: /fsx-ust/spopuri/datasets/S2ST/V1/M4T_V1_phase2/primary
    max_seconds_per_input_audio: 15
    # A per-batch target-token budget is set here and the fixed batch size is
    # null.
    fixed_batch_size: null
    max_tgt_text_tokens_per_batch: 600
    max_tgt_text_tokens_per_sample: 300
    max_units_per_sample: 1500
    num_threads: 10
    # NOTE(review): the key is spelled `prefech` (sic). It is kept unchanged
    # because the consumer presumably reads it under this exact name - verify
    # before renaming.
    prefech_batches: 10
    prepend_tgt_lang_tag: true
    shuffle_window: 1000
    text_tokenization:
      from_model: seamlessM4T_large
      spm_path: null
      langtoks: null
    unit_tokenization:
      from_model: seamlessM4T_large
      langtoks: null
      num_units: null
    # NOTE(review): placed at the `train_data` level; it could instead belong
    # under `unit_tokenization` - TODO confirm.
    unit_tokenizer_name: seamlessM4T_large
  # Optimisation / training-loop settings. All values are plain scalars, so
  # this section is a single flat mapping.
  training:
    eval_steps: 5000
    float_dtype: fp16
    label_smoothing: 0.2
    # Learning-rate settings: `start_learning_rate` is far smaller than
    # `learning_rate` and is listed alongside `warmup_steps`.
    # NOTE(review): presumably the rate ramps from the start value to
    # `learning_rate` over the warmup steps - confirm against the trainer.
    learning_rate: 0.0001
    log_steps: 200
    max_epochs: 100
    patience: 10
    start_learning_rate: 1.0e-07
    warmup_steps: 1000