Update app.py
app.py CHANGED
@@ -15,6 +15,7 @@ non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
 all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos}  # {'Romanian': ('ro', 'rum', 'ron')}
 # iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()}  # {'ro': 'Romanian', 'de': 'German'}
 iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos}  # {'ro': 'Romanian', 'de': 'German'}
+iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos}  # {'ro': ('Romanian', 'rum', 'ron')}
 langs = list(favourite_langs.keys())
 langs.extend(list(all_langs.keys()))  # Language options as list, add favourite languages first

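The new iso1toall dict keys the full ISO row by its two-letter ISO 639-1 code, so later code can go from 'ro' to the language name and its three-letter codes. A minimal self-contained sketch of the mapping, assuming the row shape shown in the comments above (the two sample rows are illustrative only; the app builds non_empty_isos from a polars DataFrame of ISO 639 codes):

# Illustrative rows only, matching the (name, iso639-1, iso639-2/B, iso639-2/T) shape implied by the comments.
non_empty_isos = [
    ("Romanian", "ro", "rum", "ron"),
    ("German", "de", "ger", "deu"),
]

iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos}

print(iso1toall["ro"])     # ('Romanian', 'rum', 'ron')
print(iso1toall["de"][2])  # 'deu' -- the three-letter code the new OPUS and SeamlessM4T paths read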
@@ -24,14 +25,15 @@ models = ["Helsinki-NLP", "QUICKMT", "Argos", "Google", "HPLT", "HPLT-OPUS",
           "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa", "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
           "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B",
           "facebook/mbart-large-50-many-to-many-mmt", "facebook/mbart-large-50-one-to-many-mmt", "facebook/mbart-large-50-many-to-one-mmt",
-          "facebook/m2m100_418M", "facebook/m2m100_1.2B", "alirezamsh/small100",
+          "facebook/m2m100_418M", "facebook/m2m100_1.2B", "alirezamsh/small100",
+          "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
           "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
           "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
           "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
           "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
           "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
           "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
-          "HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
+          "Lego-MT/Lego-MT", "HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
           "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
           "Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2"
           ]
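For context, the langs and models lists typically back the Space's input widgets. The wiring below is an assumption (it is not part of this diff) and uses placeholder data; it only illustrates how such lists are usually fed into a Gradio interface around translate_text:

import gradio as gr

# Placeholders standing in for the lists and function defined in app.py (assumed wiring, not from the diff).
langs = ["German", "Romanian", "English"]
models = ["facebook/seamless-m4t-v2-large", "Helsinki-NLP/opus-mt-tc-bible-big-roa-en"]

def translate_text(input_text, s_language, t_language, model_name):
    return f"[{model_name}] {s_language} -> {t_language}: {input_text}"

demo = gr.Interface(
    fn=translate_text,
    inputs=[gr.Textbox(label="Text"),
            gr.Dropdown(choices=langs, label="Source language"),
            gr.Dropdown(choices=langs, label="Target language"),
            gr.Dropdown(choices=models, label="Model")],
    outputs=gr.Textbox(label="Translation"),
)

if __name__ == "__main__":
    demo.launch()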
@@ -243,10 +245,9 @@ class Translators:
     def HelsinkiNLP_mulroa(self):
         try:
             pipe = pipeline("translation", model=self.model_name, device=self.device)
-
-
-            translation
-            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
+            tgt_lang = iso1toall.get(self.tl)[2]  # 'deu', 'ron', 'eng', 'fra'
+            translation = pipe(f'>>{tgt_lang}<< {self.input_text}')
+            return translation[0]['translation_text'], f'Translated from {iso1toall[self.sl][0]} to {iso1toall[self.tl][0]} with {self.model_name}.'
         except Exception as error:
             return f"Error translating with model: {self.model_name}! Try other available language combination.", error

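The rewritten HelsinkiNLP_mulroa prepends a >>xxx<< target-language token, which the multi-target OPUS-MT checkpoints expect, and it now looks the three-letter code up via iso1toall rather than the two-letter iso1_to_name keys. A standalone sketch of the same idea, with the model name taken from the list above and 'ron' standing in for iso1toall[tl][2]:

from transformers import pipeline

# Multi-target OPUS-MT checkpoints need a >>xxx<< token naming the target language.
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa")
out = pipe(">>ron<< Good morning, how are you?")
print(out[0]["translation_text"])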
@@ -398,6 +399,26 @@ class Translators:
         translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
         translated_text = translator(self.input_text, max_length=512)
         return translated_text[0]['translation_text']
+
+    def seamlessm4t1(self):
+        from transformers import AutoProcessor, SeamlessM4TModel
+        processor = AutoProcessor.from_pretrained(self.model)
+        model = SeamlessM4TModel.from_pretrained(self.model)
+        src_lang = iso1toall.get(self.sl)[2]  # 'deu', 'ron', 'eng', 'fra'
+        tgt_lang = iso1toall.get(self.tl)[2]
+        text_inputs = processor(text = self.input_text, src_lang=src_lang, return_tensors="pt")
+        output_tokens = model.generate(**text_inputs, tgt_lang=tgt_lang, generate_speech=False)
+        return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+
+    def seamlessm4t2(self):
+        from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText
+        processor = AutoProcessor.from_pretrained(self.model)
+        model = SeamlessM4Tv2ForTextToText.from_pretrained(self.model)
+        src_lang = iso1toall.get(self.sl)[2]  # 'deu', 'ron', 'eng', 'fra'
+        tgt_lang = iso1toall.get(self.tl)[2]
+        text_inputs = processor(text=self.input_text, src_lang=src_lang, return_tensors="pt")
+        decoder_input_ids = model.generate(**text_inputs, tgt_lang=tgt_lang)[0].tolist()
+        return processor.decode(decoder_input_ids, skip_special_tokens=True)

     def wingpt(self):
         model = AutoModelForCausalLM.from_pretrained(
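The two new methods split SeamlessM4T support by generation API: seamlessm4t1 drives the v1 checkpoints through SeamlessM4TModel.generate(..., generate_speech=False), while seamlessm4t2 uses the text-only SeamlessM4Tv2ForTextToText class. A self-contained sketch of the v2 text-to-text path outside the class, using the three-letter codes the app reads from iso1toall[...][2]:

from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText

checkpoint = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(checkpoint)
model = SeamlessM4Tv2ForTextToText.from_pretrained(checkpoint)

# SeamlessM4T expects three-letter language codes such as 'eng', 'deu', 'ron'.
inputs = processor(text="Good morning, how are you?", src_lang="eng", return_tensors="pt")
output_ids = model.generate(**inputs, tgt_lang="ron")
print(processor.decode(output_ids[0].tolist(), skip_special_tokens=True))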
@@ -585,6 +606,12 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
     elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
         translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_one()

+    elif model_name == "facebook/seamless-m4t-v2-large":
+        translated_text = Translators(model_name, s_language, t_language, input_text).seamlessm4t2()
+
+    elif "m4t-medium" in model_name or "m4t-large" in model_name:
+        translated_text = Translators(model_name, s_language, t_language, input_text).seamlessm4t1()
+
     elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
         translated_text = Translators(model_name, s_language, t_language, input_text).eurollm_instruct()

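The new branches route the three SeamlessM4T checkpoints from the models list: the exact-match check sends the v2 checkpoint to seamlessm4t2(), and the substring check catches the two v1 checkpoints for seamlessm4t1(). A tiny sketch of that routing logic in isolation:

def route(model_name):
    # Mirrors the new elif branches above (dispatch only, no translation).
    if model_name == "facebook/seamless-m4t-v2-large":
        return "seamlessm4t2"
    elif "m4t-medium" in model_name or "m4t-large" in model_name:
        return "seamlessm4t1"
    return "other"

for name in ("facebook/hf-seamless-m4t-medium",
             "facebook/seamless-m4t-large",
             "facebook/seamless-m4t-v2-large"):
    print(name, "->", route(name))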