santakan committed · verified · Commit 06a30c8 · 1 Parent(s): 61e9405

Update app.py

Files changed (1):
  1. app.py +45 -40
app.py CHANGED
@@ -1,23 +1,29 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
+import sys
 
 # 1. Model and tokenizer setup
-# Use a lightweight model (600M) that can run even on Hugging Face's free CPU environment.
-# The model is downloaded on the first run, so it may take a little while.
+# Use the 600M model to fit the free CPU environment on Hugging Face Spaces (16GB RAM)
 model_name = "facebook/nllb-200-distilled-600M"
 
-print("모델을 로드하는 중입니다... 잠시만 기다려주세요.")
+print(f"모델({model_name})을 로드하는 중입니다... 잠시만 기다려주세요.")
+
+# Declared as global variables
+tokenizer = None
+model = None
+
 try:
+    # Load the tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-    translator = pipeline("translation", model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang="kor_Hang", max_length=400)
     print("모델 로드 완료!")
 except Exception as e:
-    print(f"모델 로드 중 오류 발생: {e}")
+    # If the model fails to load, force-exit the app so the exact cause is visible in the Logs tab
+    print(f"❌ 모델 로드 중 치명적인 오류 발생: {e}")
+    sys.exit(1)
 
-# 2. Language code mapping (NLLB-specific codes)
-# The NLLB model uses its own codes for 200 languages instead of the generic 'en', 'ja'.
+# 2. Language code mapping
 LANG_CODES = {
     "영어 (English)": "eng_Latn",
     "일본어 (Japanese)": "jpn_Jpan",
@@ -28,29 +34,31 @@ TARGET_LANG_CODE = "kor_Hang" # Korean
 
 def translate_text(text, source_lang_name):
     """
-    Takes the user's input text and selected language and translates them into Korean.
+    Translate the input text into Korean
     """
     if not text:
         return "번역할 내용을 입력해주세요."
 
-    # Convert the selected language name to an NLLB code
-    src_code = LANG_CODES.get(source_lang_name)
-
-    # Run the translation
-    # The NLLB model is more accurate when the input language code is specified explicitly.
-    # With pipeline, src_lang must be set when the pipeline is created or when it is called.
-    # Here we adjust the tokenizer settings directly at call time instead.
-
+    if model is None or tokenizer is None:
+        return "모델이 로드되지 않았습니다. 서버 로그를 확인해주세요."
+
     try:
-        # Tokenize the input text (with the language code set)
+        # Get the input language code
+        src_code = LANG_CODES.get(source_lang_name)
+
+        # Set translation options: specify the input language
         tokenizer.src_lang = src_code
+
+        # Tokenize the input text
         inputs = tokenizer(text, return_tensors="pt")
 
-        # Generate the translation with the model (target language: Korean)
+        # Model inference (force Korean output)
         generated_tokens = model.generate(
             **inputs,
             forced_bos_token_id=tokenizer.lang_code_to_id[TARGET_LANG_CODE],
-            max_length=500
+            max_length=500,
+            num_beams=4,  # Use beam search for translation quality (can be slightly slower)
+            early_stopping=True
         )
 
         # Decode the result
@@ -58,53 +66,50 @@ def translate_text(text, source_lang_name):
         return result
 
     except Exception as e:
-        return f"번역 중 오류가 발생했습니다: {str(e)}"
+        return f"번역 에러: {str(e)}"
 
-# 3. Build the Gradio interface
-# This is the web UI the user will see.
-with gr.Blocks(title="다국어 한국어 번역기") as demo:
+# 3. Gradio interface
+with gr.Blocks(title="한글로 (Hangullo) - 다국어 번역기") as demo:
     gr.Markdown(
         """
-        # 🌍 다국어 -> 한국어 번역기
-        **영어, 일본어, 중국어**를 입력하면 **한국어**로 번역해 줍니다.
-        (Meta의 NLLB-200 모델 기반)
+        # 🇰🇷 한글로 (Hangullo)
+        **영어, 일본어, 중국어**를 입력하면 자연스러운 **한국어**로 번역해 드립니다.
+        *(Powered by Meta NLLB-200)*
         """
     )
 
     with gr.Row():
         with gr.Column():
-            # Input settings
             src_lang = gr.Dropdown(
                 choices=list(LANG_CODES.keys()),
                 value="영어 (English)",
-                label="입력 언어 선택"
+                label="입력 언어"
             )
             input_text = gr.Textbox(
                 lines=5,
-                placeholder="번역할 내용을 여기에 입력하세요...",
-                label="입력 (Input)"
+                placeholder="번역할 문장을 입력하세요...",
+                label="입력 (Source)"
             )
-            translate_btn = gr.Button("한국어로 번역하기", variant="primary")
+            translate_btn = gr.Button("한국어로 변환", variant="primary")
 
         with gr.Column():
-            # Output settings
             output_text = gr.Textbox(
                 lines=5,
-                label="한국어 번역 결과 (Result)",
-                interactive=False
+                label="한국어 결과 (Korean)",
+                interactive=False,
+                show_copy_button=True  # Add a copy button
             )
 
-    # Provide example inputs
+    # Example data
     gr.Examples(
         examples=[
-            ["Hello, how are you today?", "영어 (English)"],
-            ["こんにちは、元気ですか？", "일본어 (Japanese)"],
-            ["你好，很高兴见到你。", "중국어 (Chinese Simplified)"]
+            ["The quick brown fox jumps over the lazy dog.", "영어 (English)"],
+            ["AIの発展によって、私たちの生活は大きく変化しています。", "일본어 (Japanese)"],
+            ["今天天气真好，我们去公园散步吧。", "중국어 (Chinese Simplified)"]
        ],
         inputs=[input_text, src_lang]
     )
 
-    # Wire the button click event to the translation function
     translate_btn.click(
         fn=translate_text,
         inputs=[input_text, src_lang],
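
For reference, a minimal standalone sketch of the NLLB call path the updated app.py relies on (set tokenizer.src_lang, then generate with forced_bos_token_id and beam search). It is not part of the commit; it assumes transformers and torch are installed, and it hedges on lang_code_to_id, which recent transformers releases may no longer expose on the NLLB tokenizer (tokenizer.convert_tokens_to_ids("kor_Hang") is the usual fallback).

# Standalone sketch, not part of the commit; assumes transformers and torch are installed.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text = "Hello, how are you today?"
tokenizer.src_lang = "eng_Latn"  # NLLB code of the input language
inputs = tokenizer(text, return_tensors="pt")

# Force Korean output; fall back if this transformers version has no lang_code_to_id
try:
    kor_id = tokenizer.lang_code_to_id["kor_Hang"]
except AttributeError:
    kor_id = tokenizer.convert_tokens_to_ids("kor_Hang")

generated = model.generate(
    **inputs,
    forced_bos_token_id=kor_id,
    max_length=500,
    num_beams=4,
    early_stopping=True,
)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])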