Spaces:

Wplotnikow
/

TeacherChat

Sleeping

App Files Community

Wplotnikow committed on Aug 20

Commit

b1b58ec

verified ·

1 Parent(s): 2b9cf4d

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -14

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ def get_blocks_from_docx():
     blocks = []
     for p in doc.paragraphs:
         txt = p.text.strip()
-        # Убираем короткие заголовки:
         if (
             txt
             and not (len(txt) <= 3 and txt.isdigit())
@@ -24,14 +24,13 @@ def get_blocks_from_docx():
                 and txt == txt.upper()
                 and txt.endswith(('.', ':', '?', '!')) is False
             )
-            and len(txt.split()) > 3  # минимум 3 слова = явно не заголовок
         ):
             blocks.append(txt)
-    # Таблицы:
     for table in doc.tables:
         for row in table.rows:
             row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
-            # Аналогично — игнорируем сверхкороткие строки/возможные заголовки из таблиц:
             if row_text and len(row_text) > 35 and len(row_text.split()) > 3:
                 blocks.append(row_text)
     seen = set()
@@ -55,7 +54,12 @@ def rut5_answer(question, context):
     prompt = f"question: {question} context: {context}"
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
-        output_ids = model.generate(input_ids, max_length=250, num_beams=4, min_length=40, no_repeat_ngram_size=3)
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 def ask_chatbot(question):
@@ -65,15 +69,15 @@ def ask_chatbot(question):
         return "Ошибка: база знаний пуста или слишком мала. Проверьте .docx."
     user_vec = vectorizer.transform([question])
     sims = cosine_similarity(user_vec, matrix)
-    # ТОП-3 самых осмысленных блока
-    top_idxs = sims.argsort()[-3:][::-1]
-    # Используем только НЕКОРОТКИЕ блоки как контекст
     context_blocks = [
         blocks[i] for i in top_idxs if sims[i] > 0.08 and len(blocks[i].split()) > 3 and len(blocks[i]) > 35
     ]
     context = " ".join(context_blocks)
     answer = rut5_answer(question, context)
-    # Подстраховка — если ответ ТОЛЬКО заголовок, просто версифицируем и дополняем контекстом:
     if len(answer.strip().split()) < 8 or len(answer.split('.')) < 2:
         answer += "\n\n" + context
     return answer
@@ -92,17 +96,22 @@ with gr.Blocks() as demo:
         """
         # Русскоязычный FAQ-чат-бот на базе вашей методички и нейросетевой модели
-        Задайте вопрос — получите развернутый AI-ответ на русском языке на основании вашего документа!
         """
     )
     question = gr.Textbox(label="Ваш вопрос", lines=2)
     ask_btn = gr.Button("Получить ответ")
     answer = gr.Markdown(label="Ответ", visible=True)
-    ask_btn.click(ask_chatbot, question, answer)
-    question.submit(ask_chatbot, question, answer)
-    gr.Markdown("#### Примеры вопросов:")
-    # ВОЗВРАЩАЕМ КЛИКАБЕЛЬНЫЕ примеры
     gr.Examples(EXAMPLES, inputs=question)
     gr.Markdown("""

     blocks = []
     for p in doc.paragraphs:
         txt = p.text.strip()
+        # Исключаем очень короткие и похожие на заголовки блоки
         if (
             txt
             and not (len(txt) <= 3 and txt.isdigit())
                 and txt == txt.upper()
                 and txt.endswith(('.', ':', '?', '!')) is False
             )
+            and len(txt.split()) > 3
         ):
             blocks.append(txt)
+    # Таблицы
     for table in doc.tables:
         for row in table.rows:
             row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
             if row_text and len(row_text) > 35 and len(row_text.split()) > 3:
                 blocks.append(row_text)
     seen = set()
     prompt = f"question: {question} context: {context}"
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
+        output_ids = model.generate(
+            input_ids,
+            max_length=250, num_beams=4, min_length=40,
+            no_repeat_ngram_size=3,
+            do_sample=False
+        )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 def ask_chatbot(question):
         return "Ошибка: база знаний пуста или слишком мала. Проверьте .docx."
     user_vec = vectorizer.transform([question])
     sims = cosine_similarity(user_vec, matrix)
+    n_blocks = min(3, len(blocks))
+    top_idxs = sims.argsort()[-n_blocks:][::-1] if n_blocks > 0 else []
     context_blocks = [
         blocks[i] for i in top_idxs if sims[i] > 0.08 and len(blocks[i].split()) > 3 and len(blocks[i]) > 35
     ]
     context = " ".join(context_blocks)
+    if not context:
+        return "Не найден релевантный фрагмент в документе. Попробуйте иначе сформулировать вопрос."
     answer = rut5_answer(question, context)
     if len(answer.strip().split()) < 8 or len(answer.split('.')) < 2:
         answer += "\n\n" + context
     return answer
         """
         # Русскоязычный FAQ-чат-бот на базе вашей методички и нейросетевой модели
+        Задайте вопрос — получите развернутый AI-ответ (бот анализирует ваш документ, подборка абзацев и генерация выполняются автоматически).
         """
     )
     question = gr.Textbox(label="Ваш вопрос", lines=2)
     ask_btn = gr.Button("Получить ответ")
     answer = gr.Markdown(label="Ответ", visible=True)
+    # Generator handler: first yields the "Чат-бот думает..." placeholder, then yields the result of ask_chatbot(q).
+    def with_spinner(q):
+        yield "Чат-бот думает..."
+        yield ask_chatbot(q)
+    ask_btn.click(with_spinner, question, answer)
+    question.submit(with_spinner, question, answer)
+    gr.Markdown("#### Примеры вопросов:")
     gr.Examples(EXAMPLES, inputs=question)
     gr.Markdown("""