TiberiuCristianLeon committed on
Commit
2f20105
·
verified ·
1 Parent(s): 4e0ca09

Removed "Unbabel/TowerInstruct-Mistral-7B-v0.2", added "BSC-LT/salamandraTA-2b-instruct"

Browse files
Files changed (1) hide show
  1. app.py +33 -24
app.py CHANGED
@@ -27,14 +27,14 @@ models = ["Helsinki-NLP", "QUICKMT", "Argos", "Google", "HPLT", "HPLT-OPUS",
27
  "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
28
  "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
29
  "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
30
- "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
31
- "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
32
  "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
33
  "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
34
- "Lego-MT/Lego-MT", "HuggingFaceTB/SmolLM3-3B",
35
  "winninghealth/WiNGPT-Babel", "winninghealth/WiNGPT-Babel-2", "winninghealth/WiNGPT-Babel-2.1",
 
36
  "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
37
- "Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2"
 
38
  ]
39
  DEFAULTS = [langs[0], langs[1], models[0]]
40
 
@@ -72,6 +72,7 @@ class Translators:
72
  self.sl, self.tl = sl, tl
73
  self.input_text = input_text
74
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
75
 
76
  def google(self):
77
  url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
@@ -104,16 +105,11 @@ class Translators:
104
  outputs = ''.join(outputs) if isinstance(outputs, list) else outputs
105
  return outputs
106
 
107
- def smallonehundred(self):
108
- from transformers import M2M100ForConditionalGeneration
109
- from tokenization_small100 import SMALL100Tokenizer
110
- model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
111
- tokenizer = SMALL100Tokenizer.from_pretrained(self.model_name)
112
- tokenizer.tgt_lang = self.tl
113
- encoded_sl = tokenizer(self.input_text, return_tensors="pt")
114
- generated_tokens = model.generate(**encoded_sl, max_length=256, num_beams=5)
115
- return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
116
-
117
  def hplt(self, opus = False):
118
  # langs = ['ar', 'bs', 'ca', 'en', 'et', 'eu', 'fi', 'ga', 'gl', 'hi', 'hr', 'is', 'mt', 'nn', 'sq', 'sw', 'zh_hant']
119
  hplt_models = ['ar-en', 'bs-en', 'ca-en', 'en-ar', 'en-bs', 'en-ca', 'en-et', 'en-eu', 'en-fi',
@@ -267,16 +263,7 @@ class Translators:
267
  return self.HelsinkiNLP_mulroa()
268
  except KeyError as error:
269
  return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error
270
-
271
- def LegoMT(self):
272
- from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
273
- model = M2M100ForConditionalGeneration.from_pretrained(self.model_name) # "Lego-MT/Lego-MT"
274
- tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
275
- tokenizer.src_lang = self.sl
276
- encoded = tokenizer(self.input_text, return_tensors="pt")
277
- generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
278
- return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
279
-
280
  def madlad(self):
281
  model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
282
  tokenizer = T5Tokenizer.from_pretrained(self.model_name)
@@ -369,6 +356,25 @@ class Translators:
369
  generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
370
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  def bigscience(self):
373
  tokenizer = AutoTokenizer.from_pretrained(self.model_name)
374
  model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
@@ -575,6 +581,9 @@ def translate_text(input_text: str, s_language: str, t_language: str, model_name
575
  elif "niutrans" in model_name.lower():
576
  translated_text = Translators(model_name, sl, tl, input_text).niutrans()
577
 
 
 
 
578
  elif model_name.startswith('google-t5'):
579
  translated_text = Translators(model_name, s_language, t_language, input_text).tfive()
580
 
 
27
  "facebook/hf-seamless-m4t-medium", "facebook/seamless-m4t-large", "facebook/seamless-m4t-v2-large",
28
  "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
29
  "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
 
 
30
  "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
31
  "NiuTrans/LMT-60-0.6B", "NiuTrans/LMT-60-1.7B", "NiuTrans/LMT-60-4B",
32
+ "Lego-MT/Lego-MT", "BSC-LT/salamandraTA-2b-instruct",
33
  "winninghealth/WiNGPT-Babel", "winninghealth/WiNGPT-Babel-2", "winninghealth/WiNGPT-Babel-2.1",
34
+ "Unbabel/Tower-Plus-2B", "HuggingFaceTB/SmolLM3-3B", "Unbabel/TowerInstruct-7B-v0.2",
35
  "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
36
+ "google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large",
37
+ "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl"
38
  ]
39
  DEFAULTS = [langs[0], langs[1], models[0]]
40
 
 
72
  self.sl, self.tl = sl, tl
73
  self.input_text = input_text
74
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
75
+ self.max_new_tokens = 512
76
 
77
  def google(self):
78
  url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
 
105
  outputs = ''.join(outputs) if isinstance(outputs, list) else outputs
106
  return outputs
107
 
108
def salamandratapipe(self):
    """Translate with a SalamandraTA instruct model via a chat-style text-generation pipeline.

    Returns the assistant's reply text (the translation) as a string.
    """
    # Single-turn chat prompt in the shape the instruct model expects.
    prompt = f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text} \n{self.tl}:"
    generator = pipeline("text-generation", model=self.model_name)
    outputs = generator(
        [{"role": "user", "content": prompt}],
        max_new_tokens=self.max_new_tokens,
        early_stopping=True,
        num_beams=5,
    )
    # generated_text echoes the chat history: index 0 is the user turn,
    # index 1 is the assistant's reply — presumably always present; verify against pipeline output.
    return outputs[0]["generated_text"][1]["content"]
112
+
 
 
 
 
 
113
  def hplt(self, opus = False):
114
  # langs = ['ar', 'bs', 'ca', 'en', 'et', 'eu', 'fi', 'ga', 'gl', 'hi', 'hr', 'is', 'mt', 'nn', 'sq', 'sw', 'zh_hant']
115
  hplt_models = ['ar-en', 'bs-en', 'ca-en', 'en-ar', 'en-bs', 'en-ca', 'en-et', 'en-eu', 'en-fi',
 
263
  return self.HelsinkiNLP_mulroa()
264
  except KeyError as error:
265
  return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error
266
+
 
 
 
 
 
 
 
 
 
267
  def madlad(self):
268
  model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
269
  tokenizer = T5Tokenizer.from_pretrained(self.model_name)
 
356
  generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
357
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
358
 
359
def smallonehundred(self):
    """Translate using an M2M100 checkpoint paired with the SMALL-100 tokenizer.

    Returns the first decoded translation as a string.
    """
    from transformers import M2M100ForConditionalGeneration
    from tokenization_small100 import SMALL100Tokenizer

    translator = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
    small100_tok = SMALL100Tokenizer.from_pretrained(self.model_name)
    # Only the target language is configured here; no src_lang is set for this tokenizer.
    small100_tok.tgt_lang = self.tl
    batch = small100_tok(self.input_text, return_tensors="pt")
    output_ids = translator.generate(**batch, max_length=256, num_beams=5)
    decoded = small100_tok.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
368
+
369
def LegoMT(self):
    """Translate with a Lego-MT checkpoint through the M2M100 architecture.

    Returns the first decoded translation as a string.
    """
    from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

    lego_model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)  # "Lego-MT/Lego-MT"
    lego_tok = M2M100Tokenizer.from_pretrained(self.model_name)
    lego_tok.src_lang = self.sl
    encoded_input = lego_tok(self.input_text, return_tensors="pt")
    # Force the first generated token to the target-language id so decoding
    # starts in the requested language.
    output_ids = lego_model.generate(
        **encoded_input,
        forced_bos_token_id=lego_tok.get_lang_id(self.tl),
    )
    return lego_tok.batch_decode(output_ids, skip_special_tokens=True)[0]
377
+
378
  def bigscience(self):
379
  tokenizer = AutoTokenizer.from_pretrained(self.model_name)
380
  model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
 
581
  elif "niutrans" in model_name.lower():
582
  translated_text = Translators(model_name, sl, tl, input_text).niutrans()
583
 
584
+ elif "salamandra" in model_name.lower():
585
+ translated_text = Translators(model_name, s_language, t_language, input_text).salamandratapipe()
586
+
587
  elif model_name.startswith('google-t5'):
588
  translated_text = Translators(model_name, s_language, t_language, input_text).tfive()
589