Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,7 @@ def get_blocks_from_docx():
|
|
| 15 |
blocks = []
|
| 16 |
for p in doc.paragraphs:
|
| 17 |
txt = p.text.strip()
|
|
|
|
| 18 |
if (
|
| 19 |
txt
|
| 20 |
and not (len(txt) <= 3 and txt.isdigit())
|
|
@@ -29,22 +30,22 @@ def get_blocks_from_docx():
|
|
| 29 |
for table in doc.tables:
|
| 30 |
for row in table.rows:
|
| 31 |
row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
|
| 32 |
-
if row_text and len(row_text) >
|
| 33 |
blocks.append(row_text)
|
| 34 |
seen = set()
|
| 35 |
uniq_blocks = []
|
| 36 |
for b in blocks:
|
| 37 |
-
if b not in seen:
|
| 38 |
uniq_blocks.append(b)
|
| 39 |
seen.add(b)
|
| 40 |
return uniq_blocks
|
| 41 |
|
| 42 |
blocks = get_blocks_from_docx()
|
| 43 |
|
| 44 |
-
if
|
| 45 |
blocks = ["База знаний пуста: проверьте содержание и формат вашего .docx!"]
|
| 46 |
|
| 47 |
-
vectorizer = TfidfVectorizer().fit(blocks)
|
| 48 |
matrix = vectorizer.transform(blocks)
|
| 49 |
|
| 50 |
tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")
|
|
@@ -64,27 +65,29 @@ def rut5_answer(question, context):
|
|
| 64 |
return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 65 |
|
| 66 |
def ask_chatbot(question):
|
| 67 |
-
|
|
|
|
|
|
|
| 68 |
return "Пожалуйста, введите вопрос."
|
| 69 |
if not blocks or blocks == ["База знаний пуста: проверьте содержание и формат вашего .docx!"]:
|
| 70 |
return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space."
|
| 71 |
-
user_vec = vectorizer.transform([question])
|
| 72 |
sims = cosine_similarity(user_vec, matrix)
|
| 73 |
n_blocks = min(3, len(blocks))
|
| 74 |
-
# Корректно работают даже при len(blocks) == 1
|
| 75 |
top_idxs = list(reversed(sims.argsort()[-n_blocks:]))
|
| 76 |
context_blocks = []
|
| 77 |
-
for idx in top_idxs:
|
| 78 |
-
|
| 79 |
-
|
|
|
|
| 80 |
context_blocks.append(blocks[idx])
|
| 81 |
-
except IndexError:
|
| 82 |
-
continue
|
| 83 |
context = " ".join(context_blocks)
|
|
|
|
| 84 |
if not context:
|
| 85 |
-
return "Не найден
|
| 86 |
answer = rut5_answer(question, context)
|
| 87 |
-
|
|
|
|
| 88 |
answer += "\n\n" + context
|
| 89 |
return answer
|
| 90 |
|
|
@@ -122,4 +125,3 @@ with gr.Blocks() as demo:
|
|
| 122 |
""")
|
| 123 |
|
| 124 |
demo.launch()
|
| 125 |
-
|
|
|
|
| 15 |
blocks = []
|
| 16 |
for p in doc.paragraphs:
|
| 17 |
txt = p.text.strip()
|
| 18 |
+
# Исключаем короткие и заголовочные блоки
|
| 19 |
if (
|
| 20 |
txt
|
| 21 |
and not (len(txt) <= 3 and txt.isdigit())
|
|
|
|
| 30 |
for table in doc.tables:
|
| 31 |
for row in table.rows:
|
| 32 |
row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
|
| 33 |
+
if row_text and len(row_text) > 20 and len(row_text.split()) > 3:
|
| 34 |
blocks.append(row_text)
|
| 35 |
seen = set()
|
| 36 |
uniq_blocks = []
|
| 37 |
for b in blocks:
|
| 38 |
+
if b not in seen and len(b) > 0:
|
| 39 |
uniq_blocks.append(b)
|
| 40 |
seen.add(b)
|
| 41 |
return uniq_blocks
|
| 42 |
|
| 43 |
blocks = get_blocks_from_docx()
|
| 44 |
|
| 45 |
+
if not blocks:
|
| 46 |
blocks = ["База знаний пуста: проверьте содержание и формат вашего .docx!"]
|
| 47 |
|
| 48 |
+
vectorizer = TfidfVectorizer(lowercase=True).fit(blocks)
|
| 49 |
matrix = vectorizer.transform(blocks)
|
| 50 |
|
| 51 |
tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")
|
|
|
|
| 65 |
return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 66 |
|
| 67 |
def ask_chatbot(question):
    """Answer a user question using the most similar knowledge-base blocks.

    The question is vectorized with the module-level TF-IDF ``vectorizer``
    and compared against ``matrix`` (the TF-IDF rows of ``blocks``). Up to
    three best-matching blocks are joined into a context that is passed to
    ``rut5_answer``. If the generated answer looks too short, the context is
    appended to it.

    Args:
        question: Raw user input. ``None`` or whitespace-only input is
            treated as an empty question.

    Returns:
        The answer text, or a user-facing message when the question is
        empty, the knowledge base is empty, or no usable fragment was found.
    """
    # Treat a missing value like an empty string instead of raising.
    question = (question or "").strip()
    if not question:
        return "Пожалуйста, введите вопрос."
    if not blocks or blocks == ["База знаний пуста: проверьте содержание и формат вашего .docx!"]:
        return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space."

    # Case-insensitive search: the query is lowercased before vectorizing.
    user_vec = vectorizer.transform([question.lower()])
    # cosine_similarity returns a 2-D (1, n_blocks) array. Flatten it so the
    # ranking below yields one scalar index per block; without this,
    # argsort()[-n:] slices the row axis and every idx is a whole index
    # array, which raises IndexError on sims[idx].
    sims = cosine_similarity(user_vec, matrix).ravel()
    n_blocks = min(3, len(blocks))
    # Indices of the n_blocks most similar blocks, best match first.
    top_idxs = [int(i) for i in sims.argsort()[::-1][:n_blocks]]

    context_blocks = []
    for rank, idx in enumerate(top_idxs):
        # Lowered threshold: the best-ranked block is considered even when
        # nothing clears the similarity threshold.
        if sims[idx] > 0.05 or rank == 0:
            # Only blocks with more than 3 words and more than 20
            # characters are kept; this filter also applies to the
            # best-ranked block.
            if len(blocks[idx].split()) > 3 and len(blocks[idx]) > 20:
                context_blocks.append(blocks[idx])
    context = " ".join(context_blocks)

    # Nothing survived the filters, so there is no usable content to answer from.
    if not context:
        return "Не найден ни один фрагмент для ответа. Проверьте, что в .docx есть содержательные абзацы."

    answer = rut5_answer(question, context)
    # Safeguard: require at least 8 words and two "." characters; otherwise
    # append the retrieved context so the reply is still informative.
    if len(answer.strip().split()) < 8 or answer.count('.') < 2:
        answer += "\n\n" + context
    return answer
|
| 93 |
|
|
|
|
| 125 |
""")
|
| 126 |
|
| 127 |
demo.launch()
|
|
|