galamallikarjun committed on
Commit
6bb50ea
·
verified ·
1 Parent(s): cbe90c2

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +5 -5
  2. UNESCO_META_HF_BANNER.png +0 -0
  3. app.py +154 -0
  4. flores.py +206 -0
  5. gitattributes +35 -0
  6. requirements.txt +8 -0
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Sirmvitmallikarjun
3
- emoji: 🌖
4
- colorFrom: pink
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
+ title: NLLB
3
+ emoji: 🌐
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.32.2
8
  app_file: app.py
9
  pinned: false
10
  ---
UNESCO_META_HF_BANNER.png ADDED
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from sacremoses import MosesPunctNormalizer
4
+ from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
+ from flores import code_mapping
7
+ import platform
8
+ import torch
9
+ import nltk
10
+ from functools import lru_cache
11
+
12
# Download the NLTK "punkt_tab" data at import time.
# NOTE(review): presumably needed by the sentence splitters used during
# translation -- confirm against the stopes splitter implementation.
nltk.download("punkt_tab")

# Display names that stay available as a source language but are hidden from
# the target-language dropdown.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# Run on the CPU on macOS and on CUDA everywhere else.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Rebuild the mapping in alphabetical order of its keys (the display names).
# Dict keys are unique, so sorting the (name, code) pairs orders by name only.
code_mapping = dict(sorted(code_mapping.items()))

# Despite its name, `flores_codes` holds the display names (the mapping keys),
# not the FLORES codes themselves.
flores_codes = list(code_mapping)
target_languages = [
    language
    for language in flores_codes
    if language not in REMOVED_TARGET_LANGUAGES
]
23
+
24
def load_model():
    """Load the ``MODEL_NAME`` seq2seq checkpoint and place it on ``device``.

    Returns:
        The loaded model, already moved to the configured device.
    """
    loaded = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    loaded = loaded.to(device)
    print(f"Model loaded in {device}")
    return loaded
28
+
29
+
30
# Module-level objects created once at import time and reused by the
# translation functions below.
model = load_model()

# The tokenizer is loaded a single time, because re-loading it takes about
# 1.5 seconds each time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Punctuation normalizer (configured with lang "en") that is applied to the
# input text before it is split and translated.
punct_normalizer = MosesPunctNormalizer("en")
38
+
39
+
40
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Return the sentence splitter for a language code such as ``eng_Latn``.

    Only the first three characters of ``language_code`` are passed to
    ``get_split_algo`` (together with ``"default"``). Results are memoized
    per ``language_code``, so each splitter is built at most once while it
    stays in the cache.
    """
    return get_split_algo(language_code[:3], "default")
45
+
46
+
47
+ # cache function
48
@lru_cache(maxsize=100)
def translate(text: str, src_lang: str, tgt_lang: str):
    """Validate the language selection and return the translation of ``text``.

    Successful results are memoized on ``(text, src_lang, tgt_lang)``, so a
    repeated request is answered without calling ``_translate`` again.

    Raises:
        gr.Error: if the source or the target language is empty; the source
            language is checked first.
    """
    if src_lang and tgt_lang:
        return _translate(text, src_lang, tgt_lang)
    if not src_lang:
        raise gr.Error("The source language is empty! Please choose it in the dropdown list.")
    raise gr.Error("The target language is empty! Please choose it in the dropdown list.")
55
+
56
+
57
+ # Only assign GPU if cache not used
58
@spaces.GPU
def _translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` one sentence at a time and reassemble the result.

    The text is punctuation-normalized, split into paragraphs on newlines,
    and each paragraph is split into sentences with the splitter for the
    source language. Every sentence is translated separately; sentences are
    re-joined with a single space and paragraphs with a newline. Blank
    paragraphs are passed through as empty lines without calling the model.

    Args:
        text: The text to translate.
        src_lang: Display name of the source language (a key of
            ``code_mapping``).
        tgt_lang: Display name of the target language (a key of
            ``code_mapping``).

    Returns:
        The translated text, with the paragraph structure of the input.

    Raises:
        KeyError: if ``src_lang`` or ``tgt_lang`` is not a key of
            ``code_mapping``.
    """
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]
    # NOTE(review): this mutates the shared module-level tokenizer; confirm
    # that concurrent requests with different languages cannot interleave.
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code

    # Both lookups depend only on the language pair, so do them once instead
    # of once per paragraph / sentence.
    splitter = get_language_specific_sentence_splitter(src_code)
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

    # normalizing the punctuation first
    text = punct_normalizer.normalize(text)

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        if not paragraph.strip():
            # Keep the blank line, but do not ask the model to translate
            # an empty string.
            translated_paragraphs.append("")
            continue

        translated_sentences = []
        for sentence in splitter(paragraph):
            # Shape (1, sequence_length); used directly as the generation input.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            generated = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=target_token_id,
                # allow the output to be up to 50 tokens longer than the input
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
                num_beams=5,
                no_repeat_ngram_size=4,  # repetition blocking works better if this number is below num_beams
                renormalize_logits=True,  # recompute token probabilities after banning the repetitions
            )
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )

        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)
102
+
103
+
104
+
105
+ description = """
106
+ <div style="text-align: center;">
107
+ <img src="https://huggingface.co/spaces/UNESCO/nllb/resolve/main/UNESCO_META_HF_BANNER.png" alt="UNESCO Meta Hugging Face Banner" style="max-width: 800px; width: 100%; margin: 0 auto;">
108
+ <h1 style="color: #0077be;">UNESCO Language Translator, powered by Meta and Hugging Face</h1>
109
+ </div>
110
+
111
+ UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
112
+
113
+ This is made possible through an open approach to AI innovation using Meta's open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
114
+ """
115
+ disclaimer = """
116
+ ## Disclaimer
117
+
118
+ This translation interface, developed as part of UNESCO's work on Multilingualism and supported by Meta's No Language Left Behind AI model and Hugging Face, is designed to assist with language translation using open-source AI technologies. However, translations generated by the tool may not be accurate or perfect. While we strive to provide accurate translations, the tool may produce inaccuracies due to the complexity and nuances of different languages.
119
+
120
+ - The tool may not fully capture the context, cultural nuances, idiomatic expressions, or specific terminologies.
121
+ - Manual review and adjustment are recommended for important translations.
122
+ - The translations are provided "as is" without any warranties of any kind, either expressed or implied.
123
+ - Users should not rely solely on the tool for critical or sensitive translations and are responsible for verifying the accuracy and appropriateness of the translations for their specific needs.
124
+ - We recommend consulting with professional translators for official, legal, medical, or other critical translations.
125
+ - We shall not be liable for any direct, indirect, incidental, special, or consequential damages arising out of or in connection with the use or inability to use the translation tool, including but not limited to errors or omissions in translations.
126
+
127
+ By using this translation tool, you agree to these terms and acknowledge that the use of the tool is at your own risk.
128
+
129
+ For any feedback or support, please contact UNESCO World Atlas of Languages Team: [email protected].
130
+ """
131
+
132
+
133
+ examples_inputs = [["The United Nations Educational, Scientific and Cultural Organization is a specialized agency of the United Nations with the aim of promoting world peace and security through international cooperation in education, arts, sciences and culture. ","English","Ayacucho Quechua"],]
134
+
135
# Build the Gradio page: banner text, the two language dropdowns, the input
# box, the translate button, the output box, one cached example and the
# disclaimer. Components are rendered in the order they are created.
with gr.Blocks() as demo:
    gr.Markdown(description)

    with gr.Row():
        src_lang = gr.Dropdown(choices=flores_codes, label="Source Language")
        target_lang = gr.Dropdown(choices=target_languages, label="Target Language")

    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text")

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(lines=6, label="Output Text")

    # Clicking the button sends (text, source, target) to `translate` and
    # writes the result into the output box.
    btn.click(
        fn=translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

    examples = gr.Examples(
        examples=examples_inputs,
        inputs=[input_text, src_lang, target_lang],
        fn=translate,
        outputs=output,
        cache_examples=True,
    )

    with gr.Row():
        gr.Markdown(disclaimer)

demo.launch()
flores.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ code_mapping = {
2
+ "Acehnese (Arabic script)": "ace_Arab",
3
+ "Acehnese (Latin script)": "ace_Latn",
4
+ "Mesopotamian Arabic": "acm_Arab",
5
+ "Ta’izzi-Adeni Arabic": "acq_Arab",
6
+ "Tunisian Arabic": "aeb_Arab",
7
+ "Afrikaans": "afr_Latn",
8
+ "South Levantine Arabic": "ajp_Arab",
9
+ "Akan": "aka_Latn",
10
+ "Amharic": "amh_Ethi",
11
+ "North Levantine Arabic": "apc_Arab",
12
+ "Modern Standard Arabic": "arb_Arab",
13
+ # "Modern Standard Arabic (Romanized)": "arb_Latn", # it is in FLORES, but not in NLLB
14
+ "Najdi Arabic": "ars_Arab",
15
+ "Moroccan Arabic": "ary_Arab",
16
+ "Egyptian Arabic": "arz_Arab",
17
+ "Assamese": "asm_Beng",
18
+ "Asturian": "ast_Latn",
19
+ "Awadhi": "awa_Deva",
20
+ "Central Aymara": "ayr_Latn",
21
+ "South Azerbaijani": "azb_Arab",
22
+ "North Azerbaijani": "azj_Latn",
23
+ "Bashkir": "bak_Cyrl",
24
+ "Bambara": "bam_Latn",
25
+ "Balinese": "ban_Latn",
26
+ "Belarusian": "bel_Cyrl",
27
+ "Bemba": "bem_Latn",
28
+ "Bengali": "ben_Beng",
29
+ "Bhojpuri": "bho_Deva",
30
+ "Banjar (Arabic script)": "bjn_Arab",
31
+ "Banjar (Latin script)": "bjn_Latn",
32
+ "Standard Tibetan": "bod_Tibt",
33
+ "Bosnian": "bos_Latn",
34
+ "Buginese": "bug_Latn",
35
+ "Bulgarian": "bul_Cyrl",
36
+ "Catalan": "cat_Latn",
37
+ "Cebuano": "ceb_Latn",
38
+ "Czech": "ces_Latn",
39
+ "Chokwe": "cjk_Latn",
40
+ "Central Kurdish": "ckb_Arab",
41
+ "Crimean Tatar": "crh_Latn",
42
+ "Welsh": "cym_Latn",
43
+ "Danish": "dan_Latn",
44
+ "German": "deu_Latn",
45
+ "Southwestern Dinka": "dik_Latn",
46
+ "Dyula": "dyu_Latn",
47
+ "Dzongkha": "dzo_Tibt",
48
+ "Greek": "ell_Grek",
49
+ "English": "eng_Latn",
50
+ "Esperanto": "epo_Latn",
51
+ "Estonian": "est_Latn",
52
+ "Basque": "eus_Latn",
53
+ "Ewe": "ewe_Latn",
54
+ "Faroese": "fao_Latn",
55
+ "Fijian": "fij_Latn",
56
+ "Finnish": "fin_Latn",
57
+ "Fon": "fon_Latn",
58
+ "French": "fra_Latn",
59
+ "Friulian": "fur_Latn",
60
+ "Nigerian Fulfulde": "fuv_Latn",
61
+ "Scottish Gaelic": "gla_Latn",
62
+ "Irish": "gle_Latn",
63
+ "Galician": "glg_Latn",
64
+ "Guarani": "grn_Latn",
65
+ "Gujarati": "guj_Gujr",
66
+ "Haitian Creole": "hat_Latn",
67
+ "Hausa": "hau_Latn",
68
+ "Hebrew": "heb_Hebr",
69
+ "Hindi": "hin_Deva",
70
+ "Chhattisgarhi": "hne_Deva",
71
+ "Croatian": "hrv_Latn",
72
+ "Hungarian": "hun_Latn",
73
+ "Armenian": "hye_Armn",
74
+ "Igbo": "ibo_Latn",
75
+ "Ilocano": "ilo_Latn",
76
+ "Indonesian": "ind_Latn",
77
+ "Icelandic": "isl_Latn",
78
+ "Italian": "ita_Latn",
79
+ "Javanese": "jav_Latn",
80
+ "Japanese": "jpn_Jpan",
81
+ "Kabyle": "kab_Latn",
82
+ "Jingpho": "kac_Latn",
83
+ "Kamba": "kam_Latn",
84
+ "Kannada": "kan_Knda",
85
+ "Kashmiri (Arabic script)": "kas_Arab",
86
+ "Kashmiri (Devanagari script)": "kas_Deva",
87
+ "Georgian": "kat_Geor",
88
+ "Central Kanuri (Arabic script)": "knc_Arab",
89
+ "Central Kanuri (Latin script)": "knc_Latn",
90
+ "Kazakh": "kaz_Cyrl",
91
+ "Kabiyè": "kbp_Latn",
92
+ "Kabuverdianu": "kea_Latn",
93
+ "Khmer": "khm_Khmr",
94
+ "Kikuyu": "kik_Latn",
95
+ "Kinyarwanda": "kin_Latn",
96
+ "Kyrgyz": "kir_Cyrl",
97
+ "Kimbundu": "kmb_Latn",
98
+ "Northern Kurdish": "kmr_Latn",
99
+ "Kikongo": "kon_Latn",
100
+ "Korean": "kor_Hang",
101
+ "Lao": "lao_Laoo",
102
+ "Ligurian": "lij_Latn",
103
+ "Limburgish": "lim_Latn",
104
+ "Lingala": "lin_Latn",
105
+ "Lithuanian": "lit_Latn",
106
+ "Lombard": "lmo_Latn",
107
+ "Latgalian": "ltg_Latn",
108
+ "Luxembourgish": "ltz_Latn",
109
+ "Luba-Kasai": "lua_Latn",
110
+ "Ganda": "lug_Latn",
111
+ "Luo": "luo_Latn",
112
+ "Mizo": "lus_Latn",
113
+ "Standard Latvian": "lvs_Latn",
114
+ "Magahi": "mag_Deva",
115
+ "Maithili": "mai_Deva",
116
+ "Malayalam": "mal_Mlym",
117
+ "Marathi": "mar_Deva",
118
+ # "Minangkabau (Arabic script)": "min_Arab", # it is in FLORES, but not in NLLB
119
+ "Minangkabau (Latin script)": "min_Latn",
120
+ "Macedonian": "mkd_Cyrl",
121
+ "Plateau Malagasy": "plt_Latn",
122
+ "Maltese": "mlt_Latn",
123
+ "Meitei (Bengali script)": "mni_Beng",
124
+ "Halh Mongolian": "khk_Cyrl",
125
+ "Mossi": "mos_Latn",
126
+ "Maori": "mri_Latn",
127
+ "Burmese": "mya_Mymr",
128
+ "Dutch": "nld_Latn",
129
+ "Norwegian Nynorsk": "nno_Latn",
130
+ "Norwegian Bokmål": "nob_Latn",
131
+ "Nepali": "npi_Deva",
132
+ "Northern Sotho": "nso_Latn",
133
+ "Nuer": "nus_Latn",
134
+ "Nyanja": "nya_Latn",
135
+ "Occitan": "oci_Latn",
136
+ "West Central Oromo": "gaz_Latn",
137
+ "Odia": "ory_Orya",
138
+ "Pangasinan": "pag_Latn",
139
+ "Eastern Panjabi": "pan_Guru",
140
+ "Papiamento": "pap_Latn",
141
+ "Western Persian": "pes_Arab",
142
+ "Polish": "pol_Latn",
143
+ "Portuguese": "por_Latn",
144
+ "Dari": "prs_Arab",
145
+ "Southern Pashto": "pbt_Arab",
146
+ "Ayacucho Quechua": "quy_Latn",
147
+ "Romanian": "ron_Latn",
148
+ "Rundi": "run_Latn",
149
+ "Russian": "rus_Cyrl",
150
+ "Sango": "sag_Latn",
151
+ "Sanskrit": "san_Deva",
152
+ "Santali": "sat_Beng", # It is called sat_Olck in FLORES, but (less correctly sat_Beng in NLLB)
153
+ "Sicilian": "scn_Latn",
154
+ "Shan": "shn_Mymr",
155
+ "Sinhala": "sin_Sinh",
156
+ "Slovak": "slk_Latn",
157
+ "Slovenian": "slv_Latn",
158
+ "Samoan": "smo_Latn",
159
+ "Shona": "sna_Latn",
160
+ "Sindhi": "snd_Arab",
161
+ "Somali": "som_Latn",
162
+ "Southern Sotho": "sot_Latn",
163
+ "Spanish": "spa_Latn",
164
+ "Tosk Albanian": "als_Latn",
165
+ "Sardinian": "srd_Latn",
166
+ "Serbian": "srp_Cyrl",
167
+ "Swati": "ssw_Latn",
168
+ "Sundanese": "sun_Latn",
169
+ "Swedish": "swe_Latn",
170
+ "Swahili": "swh_Latn",
171
+ "Silesian": "szl_Latn",
172
+ "Tamil": "tam_Taml",
173
+ "Tatar": "tat_Cyrl",
174
+ "Telugu": "tel_Telu",
175
+ "Tajik": "tgk_Cyrl",
176
+ "Tagalog": "tgl_Latn",
177
+ "Thai": "tha_Thai",
178
+ "Tigrinya": "tir_Ethi",
179
+ "Tamasheq (Latin script)": "taq_Latn",
180
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
181
+ "Tok Pisin": "tpi_Latn",
182
+ "Tswana": "tsn_Latn",
183
+ "Tsonga": "tso_Latn",
184
+ "Turkmen": "tuk_Latn",
185
+ "Tumbuka": "tum_Latn",
186
+ "Turkish": "tur_Latn",
187
+ "Twi": "twi_Latn",
188
+ "Central Atlas Tamazight": "tzm_Tfng",
189
+ "Uyghur": "uig_Arab",
190
+ "Ukrainian": "ukr_Cyrl",
191
+ "Umbundu": "umb_Latn",
192
+ "Urdu": "urd_Arab",
193
+ "Northern Uzbek": "uzn_Latn",
194
+ "Venetian": "vec_Latn",
195
+ "Vietnamese": "vie_Latn",
196
+ "Waray": "war_Latn",
197
+ "Wolof": "wol_Latn",
198
+ "Xhosa": "xho_Latn",
199
+ "Eastern Yiddish": "ydd_Hebr",
200
+ "Yoruba": "yor_Latn",
201
+ "Yue Chinese": "yue_Hant",
202
+ "Chinese (Simplified)": "zho_Hans",
203
+ "Chinese (Traditional)": "zho_Hant",
204
+ "Standard Malay": "zsm_Latn",
205
+ "Zulu": "zul_Latn",
206
+ }
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu113
2
+ transformers
3
+ torch
4
+ gradio==4.32.2
5
+ spaces
6
+ nltk
7
+ sacremoses
8
+ stopes[mono] @ git+https://github.com/facebookresearch/stopes@better-sentence-splitters