# lang_list.py
  1. # Language dict
  2. language_code_to_name = {
  3. "afr": "Afrikaans",
  4. "amh": "Amharic",
  5. "arb": "Modern Standard Arabic",
  6. "ary": "Moroccan Arabic",
  7. "arz": "Egyptian Arabic",
  8. "asm": "Assamese",
  9. "ast": "Asturian",
  10. "azj": "North Azerbaijani",
  11. "bel": "Belarusian",
  12. "ben": "Bengali",
  13. "bos": "Bosnian",
  14. "bul": "Bulgarian",
  15. "cat": "Catalan",
  16. "ceb": "Cebuano",
  17. "ces": "Czech",
  18. "ckb": "Central Kurdish",
  19. "cmn": "Mandarin Chinese",
  20. "cym": "Welsh",
  21. "dan": "Danish",
  22. "deu": "German",
  23. "ell": "Greek",
  24. "eng": "English",
  25. "est": "Estonian",
  26. "eus": "Basque",
  27. "fin": "Finnish",
  28. "fra": "French",
  29. "gaz": "West Central Oromo",
  30. "gle": "Irish",
  31. "glg": "Galician",
  32. "guj": "Gujarati",
  33. "heb": "Hebrew",
  34. "hin": "Hindi",
  35. "hrv": "Croatian",
  36. "hun": "Hungarian",
  37. "hye": "Armenian",
  38. "ibo": "Igbo",
  39. "ind": "Indonesian",
  40. "isl": "Icelandic",
  41. "ita": "Italian",
  42. "jav": "Javanese",
  43. "jpn": "Japanese",
  44. "kam": "Kamba",
  45. "kan": "Kannada",
  46. "kat": "Georgian",
  47. "kaz": "Kazakh",
  48. "kea": "Kabuverdianu",
  49. "khk": "Halh Mongolian",
  50. "khm": "Khmer",
  51. "kir": "Kyrgyz",
  52. "kor": "Korean",
  53. "lao": "Lao",
  54. "lit": "Lithuanian",
  55. "ltz": "Luxembourgish",
  56. "lug": "Ganda",
  57. "luo": "Luo",
  58. "lvs": "Standard Latvian",
  59. "mai": "Maithili",
  60. "mal": "Malayalam",
  61. "mar": "Marathi",
  62. "mkd": "Macedonian",
  63. "mlt": "Maltese",
  64. "mni": "Meitei",
  65. "mya": "Burmese",
  66. "nld": "Dutch",
  67. "nno": "Norwegian Nynorsk",
  68. "nob": "Norwegian Bokm\u00e5l",
  69. "npi": "Nepali",
  70. "nya": "Nyanja",
  71. "oci": "Occitan",
  72. "ory": "Odia",
  73. "pan": "Punjabi",
  74. "pbt": "Southern Pashto",
  75. "pes": "Western Persian",
  76. "pol": "Polish",
  77. "por": "Portuguese",
  78. "ron": "Romanian",
  79. "rus": "Russian",
  80. "slk": "Slovak",
  81. "slv": "Slovenian",
  82. "sna": "Shona",
  83. "snd": "Sindhi",
  84. "som": "Somali",
  85. "spa": "Spanish",
  86. "srp": "Serbian",
  87. "swe": "Swedish",
  88. "swh": "Swahili",
  89. "tam": "Tamil",
  90. "tel": "Telugu",
  91. "tgk": "Tajik",
  92. "tgl": "Tagalog",
  93. "tha": "Thai",
  94. "tur": "Turkish",
  95. "ukr": "Ukrainian",
  96. "urd": "Urdu",
  97. "uzn": "Northern Uzbek",
  98. "vie": "Vietnamese",
  99. "xho": "Xhosa",
  100. "yor": "Yoruba",
  101. "yue": "Cantonese",
  102. "zlm": "Colloquial Malay",
  103. "zsm": "Standard Malay",
  104. "zul": "Zulu",
  105. }
  106. LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
  107. # Source langs: S2ST / S2TT / ASR don't need source lang
  108. # T2TT / T2ST use this
  109. text_source_language_codes = [
  110. "afr",
  111. "amh",
  112. "arb",
  113. "ary",
  114. "arz",
  115. "asm",
  116. "azj",
  117. "bel",
  118. "ben",
  119. "bos",
  120. "bul",
  121. "cat",
  122. "ceb",
  123. "ces",
  124. "ckb",
  125. "cmn",
  126. "cym",
  127. "dan",
  128. "deu",
  129. "ell",
  130. "eng",
  131. "est",
  132. "eus",
  133. "fin",
  134. "fra",
  135. "gaz",
  136. "gle",
  137. "glg",
  138. "guj",
  139. "heb",
  140. "hin",
  141. "hrv",
  142. "hun",
  143. "hye",
  144. "ibo",
  145. "ind",
  146. "isl",
  147. "ita",
  148. "jav",
  149. "jpn",
  150. "kan",
  151. "kat",
  152. "kaz",
  153. "khk",
  154. "khm",
  155. "kir",
  156. "kor",
  157. "lao",
  158. "lit",
  159. "lug",
  160. "luo",
  161. "lvs",
  162. "mai",
  163. "mal",
  164. "mar",
  165. "mkd",
  166. "mlt",
  167. "mni",
  168. "mya",
  169. "nld",
  170. "nno",
  171. "nob",
  172. "npi",
  173. "nya",
  174. "ory",
  175. "pan",
  176. "pbt",
  177. "pes",
  178. "pol",
  179. "por",
  180. "ron",
  181. "rus",
  182. "slk",
  183. "slv",
  184. "sna",
  185. "snd",
  186. "som",
  187. "spa",
  188. "srp",
  189. "swe",
  190. "swh",
  191. "tam",
  192. "tel",
  193. "tgk",
  194. "tgl",
  195. "tha",
  196. "tur",
  197. "ukr",
  198. "urd",
  199. "uzn",
  200. "vie",
  201. "yor",
  202. "yue",
  203. "zsm",
  204. "zul",
  205. ]
  206. TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
  207. # Target langs:
  208. # S2ST / T2ST
  209. s2st_target_language_codes = [
  210. "eng",
  211. "arb",
  212. "ben",
  213. "cat",
  214. "ces",
  215. "cmn",
  216. "cym",
  217. "dan",
  218. "deu",
  219. "est",
  220. "fin",
  221. "fra",
  222. "hin",
  223. "ind",
  224. "ita",
  225. "jpn",
  226. "kor",
  227. "mlt",
  228. "nld",
  229. "pes",
  230. "pol",
  231. "por",
  232. "ron",
  233. "rus",
  234. "slk",
  235. "spa",
  236. "swe",
  237. "swh",
  238. "tel",
  239. "tgl",
  240. "tha",
  241. "tur",
  242. "ukr",
  243. "urd",
  244. "uzn",
  245. "vie",
  246. ]
  247. S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
  248. T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES
  249. # S2TT / T2TT / ASR
  250. S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
  251. T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
  252. ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES