Update app.py
app.py CHANGED
@@ -706,12 +706,13 @@ def stream_chat(
 
     print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
-
+
+    try:
+        # Set a longer timeout when initializing the streamer
     streamer = TextIteratorStreamer(
-        tokenizer, timeout=
+        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
     )
-
-    # Set up the generation parameters
+
     generate_kwargs = dict(
         **inputs,
         streamer=streamer,
@@ -721,9 +722,13 @@ def stream_chat(
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
-
+        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        use_cache=True
     )
 
+
+
     # Clean up memory
     clear_cuda_memory()
 
@@ -731,35 +736,26 @@ def stream_chat(
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
 
-    #
+    # Stronger exception handling while streaming
     buffer = ""
-    partial_message = ""
-    last_yield_time = time.time()
-
     try:
         for new_text in streamer:
-
-
-
-            # Update the result at regular intervals or whenever enough text has accumulated
-            current_time = time.time()
-            if current_time - last_yield_time > 0.1 or len(partial_message) > 20:
+            try:
+                buffer += new_text
                 yield "", history + [[message, buffer]]
-
-
-
-
-
-
-
-        # Save to the conversation history
-        chat_history.add_conversation(message, buffer)
-
-    except Exception as e:
-        print(f"Error occurred during streaming: {str(e)}")
-        if not buffer:  # Show an error message if the buffer is empty
-            buffer = f"An error occurred while generating the response: {str(e)}"
+            except Exception as inner_e:
+                print(f"Error while processing an individual token: {str(inner_e)}")
+                continue
+    except Exception as stream_e:
+        print(f"Overall streaming error: {str(stream_e)}")
+        if not buffer:
+            buffer = "An error occurred while generating the response."
         yield "", history + [[message, buffer]]
+    except Exception as outer_e:
+        print(f"Error in the overall generation process: {str(outer_e)}")
+        yield "", history + [[message, "Sorry, the response could not be generated."]]
+
+    ]
 
     # Wait for the thread to finish if it is still running
     if thread.is_alive():
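
For reference, the sketch below shows the general pattern this change moves to: run model.generate in a background thread, consume a transformers TextIteratorStreamer with skip_prompt/skip_special_tokens and a generous timeout, and guard the consuming loop so a slow or failed chunk does not kill the whole reply. It is a minimal, self-contained illustration only; the model id, the stream_reply helper, and the error messages are placeholders, not code from this Space.

# Minimal sketch of the threaded streaming pattern (assumed names, not the Space's code)
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "gpt2"  # placeholder checkpoint; the Space loads its own model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)


def stream_reply(prompt: str, max_new_tokens: int = 128, temperature: float = 0.7):
    """Yield the partial reply as it is generated (hypothetical helper)."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt drops the echoed prompt; a generous timeout avoids the streamer
    # raising queue.Empty when the first token is slow to arrive.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        # Fall back to EOS when the tokenizer defines no pad token.
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )

    # generate() blocks, so it runs in a worker thread while we consume the streamer.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    try:
        for new_text in streamer:
            buffer += new_text
            yield buffer  # partial reply so far
    except Exception as exc:  # e.g. a timeout surfaced by the streamer queue
        print(f"streaming error: {exc}")
        if not buffer:
            yield "An error occurred while generating the response."
    finally:
        thread.join()  # mirrors the is_alive() check at the end of the diff


if __name__ == "__main__":
    for partial in stream_reply("Hello, how are you?"):
        print(partial)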
|