# asr_small_wh_transc.yaml (2.6 KB)
  # Data-loading settings for the evaluation split.
  # NOTE(review): this section was recovered from a listing whose indentation
  # was lost; nesting is reconstructed from key order (keys are alphabetical
  # inside each mapping) -- confirm against the consumer's schema.
  eval_data:
    audio:
      audio_root_dir: /fsx-ust/data/audio_zips/
      fbanks_num_mel_bins: 80
      fbanks_standardize_audio: true
      fbanks_waveform_scale: 32768
    fbank_feats_pad_idx: 0
    manifest_list: dev_asr_only_aggregated_adapted
    manifest_list_path: null
    manifest_path_prefix: /data/home/mavlyutov/s2t_ondevice/
    max_seconds_per_input_audio: 15
    fixed_batch_size: 40
    max_tgt_text_tokens_per_batch: 1000
    max_tgt_text_tokens_per_sample: 300
    max_units_per_sample: 1500
    num_threads: 5
    # Key spelling ("prefech") is kept exactly as found; presumably it must
    # match the field name the consumer reads -- verify before renaming.
    prefech_batches: null
    prepend_tgt_lang_tag: true
    shuffle_window: 1000
    text_tokenization:
      from_model: null
      langtoks:
        - eng
        - rus
        - hin
        - por
        - spa
      spm_path: /data/home/mavlyutov/s2t_ondevice/vocab20k/5_5_20k.model
    unit_tokenization:
      from_model: seamlessM4T_large
      langtoks: null
      num_units: null
    # NOTE(review): placed as a sibling of unit_tokenization; key order alone
    # does not rule out it being a child of unit_tokenization -- confirm.
    unit_tokenizer_name: seamlessM4T_large
  # Model settings: explicit size parameters under custom_params, plus
  # optional sources to initialise from (all null here).
  # NOTE(review): nesting was reconstructed from key order after indentation
  # was lost; custom_params is taken to end where alphabetical order restarts
  # at from_model -- confirm against the consumer's schema.
  model:
    custom_params:
      model_embed_dim: 768
      nllb_decoder_layers: 3
      nllb_encoder_layers: 1
      nllb_vocabulary_size: 256102
      t2u_decoder_layers: 1
      t2u_encoder_layers: 1
      unit_vocabulary_size: 10082
      w2v2_encoder_layers: 6
      w2v2_encoder_layers_layernorm_features: false
      w2v2_encoder_layers_use_conformer: true
      w2v2_num_pos_conv_groups: 0
      w2v2_pos_conv_kernel_size: 0
      w2v2_pos_encoder_depth: 0
      w2v2_pos_encoder_type: relative
    from_model: null
    from_model_config: null
    pretrained_s2t_decoder_path: null
    pretrained_t2u_path: null
    pretrained_w2v2_path: null
  # Data-loading settings for the training split. Same keys as eval_data;
  # the values that differ are manifest_list, max_tgt_text_tokens_per_batch
  # (600 vs 1000) and num_threads (4 vs 5).
  # NOTE(review): nesting was reconstructed from key order (keys are
  # alphabetical inside each mapping) after indentation was lost -- confirm
  # against the consumer's schema.
  train_data:
    audio:
      audio_root_dir: /fsx-ust/data/audio_zips/
      fbanks_num_mel_bins: 80
      fbanks_standardize_audio: true
      fbanks_waveform_scale: 32768
    fbank_feats_pad_idx: 0
    manifest_list: train_asr_only_aggregated_5_dial_filtered_adapted_wh_transc
    manifest_list_path: null
    manifest_path_prefix: /data/home/mavlyutov/s2t_ondevice/
    max_seconds_per_input_audio: 15
    fixed_batch_size: 40
    max_tgt_text_tokens_per_batch: 600
    max_tgt_text_tokens_per_sample: 300
    max_units_per_sample: 1500
    num_threads: 4
    # Key spelling ("prefech") is kept exactly as found; presumably it must
    # match the field name the consumer reads -- verify before renaming.
    prefech_batches: null
    prepend_tgt_lang_tag: true
    shuffle_window: 1000
    text_tokenization:
      from_model: null
      langtoks:
        - eng
        - rus
        - hin
        - por
        - spa
      spm_path: /data/home/mavlyutov/s2t_ondevice/vocab20k/5_5_20k.model
    unit_tokenization:
      from_model: seamlessM4T_large
      langtoks: null
      num_units: null
    # NOTE(review): placed as a sibling of unit_tokenization; key order alone
    # does not rule out it being a child of unit_tokenization -- confirm.
    unit_tokenizer_name: seamlessM4T_large
  # Optimisation / schedule settings for the training run.
  # Values are reproduced exactly as found; only the fused listing numbers
  # were removed and the mapping indentation restored.
  training:
    eval_steps: 1000
    float_dtype: fp32
    label_smoothing: 0.2
    learning_rate: 0.0001
    log_steps: 50
    max_epochs: 100
    patience: 10
    # Written with an explicit ".0" and signed exponent so YAML 1.1 loaders
    # also resolve it as a float rather than a string.
    start_learning_rate: 1.0e-07
    warmup_steps: 1000