Update src/explainability.py
src/explainability.py  (+63 -103)  CHANGED

@@ -1,128 +1,88 @@
 # src/explainability.py
 from __future__ import annotations

 import math
-import os
 import re
-from functools import lru_cache
-from typing import Dict, List, Tuple
-
-# Streamlit is only used in optional helpers to avoid import-time overhead in non-UI contexts.
-try:
-    import streamlit as st  # noqa: F401
-except Exception:
-    st = None  # type: ignore
-
-# Prefer the same embedding model elsewhere in the Space
-E5_MODEL_ID = os.environ.get("E5_MODEL_ID", "intfloat/e5-base-v2")

-
-
-_STOPWORDS = {
-    "
-
-    "subjective","objective","cc","chief","complaint"
-}
-
-def _tokens(s: str) -> List[str]:
-    return [w.lower() for w in _WORD_RE.findall(s or "") if w and w.lower() not in _STOPWORDS]

 def segment_claims(text: str) -> List[str]:
-    """Split
     if not text:
         return []
-
-
-    claims = [
-    return claims[:
-
-@lru_cache(maxsize=1)
-def _get_e5():
-    """Load sentence-transformers E5 model lazily."""
-    from sentence_transformers import SentenceTransformer
-    model = SentenceTransformer(E5_MODEL_ID)
-    return model
-
-def _embed_query_passages(summary: str, claims: List[str]):
-    """E5 uses 'query:' for queries and 'passage:' for documents."""
-    model = _get_e5()
-    q = f"query: {summary.strip()}"
-    ps = [f"passage: {c.strip()}" for c in claims]
-    import numpy as np  # local import to avoid global dependency at parse-time
-    qv = model.encode(q, normalize_embeddings=True)
-    pvs = model.encode(ps, normalize_embeddings=True)
-    return qv, pvs
-
-def _cos(a, b):
-    import numpy as np
-    return float(np.dot(a, b))
-
-def _idf(corpus_tokens: List[List[str]]) -> Dict[str, float]:
-    N = max(1, len(corpus_tokens))
-    df: Dict[str,int] = {}
-    for toks in corpus_tokens:
-        for t in set(toks):
-            df[t] = df.get(t, 0) + 1
-    return {t: math.log((N + 1) / (df[t] + 0.5)) + 1.0 for t in df}

 def _tf(tokens: List[str]) -> Dict[str, float]:
     tf: Dict[str, float] = {}
     for t in tokens:
         tf[t] = tf.get(t, 0.0) + 1.0
-
-
-
-
-
-
-
-
-
-
-
-
-    *,
-    top_n: int = 4,
-    min_weight: float = 0.03,
-) -> List[Dict]:
-    """Compute weighted referral tokens for a section based on post-hoc similarity.

-
-
-
-    3) For each claim, compute TF-IDF over its tokens; weight each claim's tokens by sim.
-    4) Aggregate across claims; L1-normalize over the section.
-    5) Return top-N tokens as chips (token, weight).
-    """
-    claims = segment_claims(section_text)
     if not claims:
         return []
-
-
-
-    idf = _idf(claim_tokens)
-
-    # Embed for similarity
-    try:
-        qv, pvs = _embed_query_passages(referral_summary, claims)
-        sims = [_cos(qv, pv) for pv in pvs]  # already normalized vectors
-    except Exception:
-        # If embedding fails (no internet or package missing), fall back to simple heuristics
-        sims = [1.0 for _ in claims]
-
-    # Weight tokens per-claim and aggregate
     agg: Dict[str, float] = {}
-    for toks
         tf = _tf(toks)
         for t, tv in tf.items():
-
-
-
-
-
     ranked = sorted(agg.items(), key=lambda kv: kv[1], reverse=True)
-
-
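The truncated lines above elide most of the removed scorer's body, but the surviving docstring (steps 3-5) and the sims, idf, and agg fragments pin down its shape: each claim's TF-IDF weights were scaled by that claim's embedding similarity to the referral summary, summed across claims, L1-normalized, and cut to the top-N chips. A minimal sketch of that aggregation step, where the function name and loop body are reconstructions rather than the original code:

from typing import Dict, List

def _aggregate_sim_weighted(claim_tokens: List[List[str]], sims: List[float],
                            idf: Dict[str, float], top_n: int = 4,
                            min_weight: float = 0.03) -> List[Dict]:
    # Assumed reconstruction of removed steps 3-5: scale each claim's raw term counts
    # by IDF and by that claim's similarity to the referral summary, sum across claims,
    # L1-normalize over the section, then keep the top-N tokens as chips.
    agg: Dict[str, float] = {}
    for toks, sim in zip(claim_tokens, sims):
        tf: Dict[str, float] = {}
        for t in toks:
            tf[t] = tf.get(t, 0.0) + 1.0
        for t, tv in tf.items():
            agg[t] = agg.get(t, 0.0) + sim * tv * idf.get(t, 1.0)
    total = sum(agg.values()) or 1.0
    ranked = sorted(agg.items(), key=lambda kv: kv[1], reverse=True)
    return [{"token": t, "weight": round(w / total, 4)}
            for t, w in ranked if w / total >= min_weight][:top_n]

The rewrite below drops the embedding and Streamlit dependencies entirely and keeps only a deterministic TF-IDF path.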
 # src/explainability.py
 from __future__ import annotations
+"""Explainability helpers (post-hoc only).
+
+Provides deterministic "chips" extracted from assessment/plan text.
+Caching by (case_id, section, text_hash) can be layered on top by the UI.
+"""

 import math
 import re
+from typing import Dict, List

+def _tokenize(s: str) -> List[str]:
+    s = s.lower()
+    # Keep simple alphanumerics
+    toks = re.findall(r"[a-z0-9]+", s)
+    return [t for t in toks if len(t) >= 3]

 def segment_claims(text: str) -> List[str]:
+    """Split text into claim-like sentences/lines."""
     if not text:
         return []
+    # Split by newline or period, keep moderately long segments
+    raw = re.split(r"[.\n]+", text)
+    claims = [c.strip() for c in raw if len(c.strip()) >= 12]
+    return claims[:10]

 def _tf(tokens: List[str]) -> Dict[str, float]:
     tf: Dict[str, float] = {}
     for t in tokens:
         tf[t] = tf.get(t, 0.0) + 1.0
+    s = sum(tf.values()) or 1.0
+    for k in list(tf.keys()):
+        tf[k] = tf[k] / s
+    return tf
+
+def _idf(docs: List[List[str]]) -> Dict[str, float]:
+    df: Dict[str, int] = {}
+    N = max(1, len(docs))
+    for doc in docs:
+        for t in set(doc):
+            df[t] = df.get(t, 0) + 1
+    return {t: math.log((N + 1) / (df_t + 1)) + 1.0 for t, df_t in df.items()}

+def chips_from_text(text: str, top_n: int = 10, min_weight: float = 0.02) -> List[Dict[str, float]]:
+    """Generate top-n weighted tokens from text using simple TF-IDF."""
+    claims = segment_claims(text)
     if not claims:
         return []
+    docs = [_tokenize(c) for c in claims]
+    idf = _idf(docs)
+    # Weight each token by TF * IDF and aggregate across claims
     agg: Dict[str, float] = {}
+    for toks in docs:
         tf = _tf(toks)
         for t, tv in tf.items():
+            agg[t] = agg.get(t, 0.0) + tv * idf.get(t, 1.0)
+    # Normalize L1
+    s = sum(agg.values()) or 1.0
+    for k in list(agg.keys()):
+        agg[k] /= s
     ranked = sorted(agg.items(), key=lambda kv: kv[1], reverse=True)
+    return [{"token": tok, "weight": round(w, 4)} for tok, w in ranked if w >= min_weight][:top_n]
+
+# --- V2 helpers (post-hoc only, deterministic) ---
+def chip_cache_key(case_id: str, section: str, text: str) -> str:
+    """Deterministic cache key for explainability chips."""
+    import hashlib, json
+    blob = json.dumps({"case_id": case_id, "section": section, "text": text}, sort_keys=True).encode("utf-8")
+    return hashlib.sha256(blob).hexdigest()
+
+def ensure_chip_schema(chips):
+    """Force a consistent chip schema: [{token, weight}] sorted by weight desc."""
+    if not isinstance(chips, (list, tuple)):
+        return []
+    norm = []
+    for c in chips:
+        if not isinstance(c, dict):
+            continue
+        tok = str(c.get("token", "")).strip()
+        w = float(c.get("weight", 0.0))
+        if tok:
+            norm.append({"token": tok, "weight": round(w, 4)})
+    norm.sort(key=lambda x: x["weight"], reverse=True)
+    return norm
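A short usage sketch of the new helpers; the import path assumes the repository root is on sys.path, and the clinical text is purely illustrative:

from src.explainability import chips_from_text, ensure_chip_schema

sample = (
    "Assessment: likely community-acquired pneumonia. "
    "Plan: start empiric antibiotics and obtain a chest x-ray. "
    "Follow up in 48 hours to reassess."
)
# chips_from_text segments the text into claims, scores tokens with TF-IDF,
# and returns up to top_n {"token", "weight"} dicts with L1-normalized weights.
chips = ensure_chip_schema(chips_from_text(sample, top_n=5))
for chip in chips:
    print(chip["token"], chip["weight"])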
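The module docstring notes that caching by (case_id, section, text_hash) can be layered on top by the UI; a minimal sketch of that layer follows, where the in-memory dict, the wrapper name cached_chips, and the example identifiers are assumptions rather than code from this repo:

from typing import Dict, List

from src.explainability import chip_cache_key, chips_from_text, ensure_chip_schema

_CHIP_CACHE: Dict[str, List[dict]] = {}

def cached_chips(case_id: str, section: str, text: str) -> List[dict]:
    # chip_cache_key hashes (case_id, section, text), so identical inputs
    # reuse previously computed chips instead of re-running TF-IDF.
    key = chip_cache_key(case_id, section, text)
    if key not in _CHIP_CACHE:
        _CHIP_CACHE[key] = ensure_chip_schema(chips_from_text(text))
    return _CHIP_CACHE[key]

# e.g. cached_chips("case-001", "assessment", assessment_text)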