Removed "Unbabel/TowerInstruct-Mistral-7B-v0.2", added "BSC-LT/salamandraTA-2b-instruct"
app.py CHANGED
@@ -27,14 +27,14 @@ models = ["Helsinki-NLP", "QUICKMT", "Argos", "Google", "HPLT", "HPLT-OPUS",
           "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
           "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
           "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
-          "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
-          "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
           "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
           "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
-          "Lego-MT/Lego-MT", "
+          "Lego-MT/Lego-MT", "BSC-LT/salamandraTA-2b-instruct",
           "winninghealth/WiNGPT-Babel", "winninghealth/WiNGPT-Babel-2", "winninghealth/WiNGPT-Babel-2.1",
+          "Unbabel/Tower-Plus-2B", "HuggingFaceTB/SmolLM3-3B", "Unbabel/TowerInstruct-7B-v0.2",
           "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
-          "
+          "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
+          "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl"
           ]
 DEFAULTS = [langs[0], langs[1], models[0]]
@@ -72,6 +72,7 @@ class Translators:
         self.sl, self.tl = sl, tl
         self.input_text = input_text
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.max_new_tokens = 512

     def google(self):
         url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
@@ -104,16 +105,11 @@ class Translators:
         outputs = ''.join(outputs) if isinstance(outputs, list) else outputs
         return outputs

-    def smallonehundred(self):
-        from transformers import M2M100ForConditionalGeneration
-        from tokenization_small100 import SMALL100Tokenizer
-        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
-        tokenizer = SMALL100Tokenizer.from_pretrained(self.model_name)
-        tokenizer.tgt_lang = self.tl
-        encoded_sl = tokenizer(self.input_text, return_tensors="pt")
-        generated_tokens = model.generate(**encoded_sl, max_length=256, num_beams=5)
-        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
+    def salamandratapipe(self):
+        pipe = pipeline("text-generation", model=self.model_name)
+        messages = [{"role": "user", "content": f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text} \n{self.tl}:"}]
+        return pipe(messages, max_new_tokens=self.max_new_tokens, early_stopping=True, num_beams=5)[0]["generated_text"][1]["content"]
+
     def hplt(self, opus = False):
         # langs = ['ar', 'bs', 'ca', 'en', 'et', 'eu', 'fi', 'ga', 'gl', 'hi', 'hr', 'is', 'mt', 'nn', 'sq', 'sw', 'zh_hant']
         hplt_models = ['ar-en', 'bs-en', 'ca-en', 'en-ar', 'en-bs', 'en-ca', 'en-et', 'en-eu', 'en-fi',
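The chained indexing on the last line of salamandratapipe is easy to misread, so here is a minimal sketch of the return shape it relies on (the model id comes from this commit; the sentence and language pair are illustrative). When a transformers text-generation pipeline is given a list of chat messages, it returns the whole conversation with the generated assistant turn appended; max_new_tokens here mirrors the new self.max_new_tokens = 512 attribute added above:

from transformers import pipeline

# Sketch only: model id from the diff, example text and languages invented.
pipe = pipeline("text-generation", model="BSC-LT/salamandraTA-2b-instruct")
messages = [{"role": "user",
             "content": "Translate the following text from English into Catalan.\nEnglish: Good morning. \nCatalan:"}]
out = pipe(messages, max_new_tokens=512, early_stopping=True, num_beams=5)
# out[0]["generated_text"] is the full chat, [user_message, assistant_message],
# so index [1] picks out the model's reply.
print(out[0]["generated_text"][1]["content"])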
@@ -267,16 +263,7 @@ class Translators:
             return self.HelsinkiNLP_mulroa()
         except KeyError as error:
             return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error
-
-    def LegoMT(self):
-        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
-        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name) # "Lego-MT/Lego-MT"
-        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
-        tokenizer.src_lang = self.sl
-        encoded = tokenizer(self.input_text, return_tensors="pt")
-        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
-        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
+
     def madlad(self):
         model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
         tokenizer = T5Tokenizer.from_pretrained(self.model_name)
@@ -369,6 +356,25 @@ class Translators:
         generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
         return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

+    def smallonehundred(self):
+        from transformers import M2M100ForConditionalGeneration
+        from tokenization_small100 import SMALL100Tokenizer
+        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = SMALL100Tokenizer.from_pretrained(self.model_name)
+        tokenizer.tgt_lang = self.tl
+        encoded_sl = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded_sl, max_length=256, num_beams=5)
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def LegoMT(self):
+        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name) # "Lego-MT/Lego-MT"
+        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
+        tokenizer.src_lang = self.sl
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
     def bigscience(self):
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
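Both methods in this hunk are relocations, not new code: smallonehundred moves down from lines 107-116 and LegoMT from lines 270-279, with their bodies unchanged. One caveat: SMALL100Tokenizer is not part of the transformers package, so "from tokenization_small100 import SMALL100Tokenizer" only resolves if tokenization_small100.py is checked into the Space (the file is distributed alongside the SMaLL-100 checkpoint). A hedged usage sketch, assuming the Translators(model_name, sl, tl, input_text) signature used elsewhere in this diff, illustrative "en"/"fr" language codes, and the alirezamsh/small100 checkpoint (which this commit's model list does not actually include):

# Sketch only: language codes and the SMaLL-100 checkpoint are assumptions.
lego = Translators("Lego-MT/Lego-MT", "en", "fr", "How are you?")
print(lego.LegoMT())  # target language forced via tokenizer.get_lang_id("fr")

small = Translators("alirezamsh/small100", "en", "fr", "How are you?")
print(small.smallonehundred())  # SMaLL-100 sets only tgt_lang; no source code needed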
@@ -575,6 +581,9 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
     elif "niutrans" in model_name.lower():
         translated_text = Translators(model_name, sl, tl, input_text).niutrans()

+    elif "salamandra" in model_name.lower():
+        translated_text = Translators(model_name, s_language, t_language, input_text).salamandratapipe()
+
     elif model_name.startswith('google-t5'):
         translated_text = Translators(model_name, s_language, t_language, input_text).tfive()

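The new branch keys on the substring "salamandra" and, unlike the niutrans branch above it that passes the sl/tl codes, forwards the full s_language/t_language names, presumably because the chat prompt in salamandratapipe spells the languages out in plain text. An illustrative call into the new path (argument names from the hunk header, values invented):

# Sketch only: the argument values are invented for illustration.
translated = translate_text(
    input_text="Good morning.",
    s_language="English",   # full names feed the natural-language prompt
    t_language="Catalan",
    model_name="BSC-LT/salamandraTA-2b-instruct",  # contains "salamandra"
)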