Spaces:

taaaranis
/

marine

Runtime error

App Files Files Community

taaaranis committed on Sep 2, 2025

Commit

5797b3f

verified ·

1 Parent(s): 4f3033e

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -66

app.py CHANGED Viewed

@@ -1,27 +1,51 @@
 import gradio as gr
 import numpy as np
 import cv2
-import os, json, re, base64, requests
 from typing import List, Dict
 from huggingface_hub import hf_hub_download, list_repo_files
 from ultralytics import YOLO
 # ---------- Config ----------
 FATHOM_REPO = os.getenv("FathomNet/fathomnet2023-comp-baseline", "FathomNet/fathomnet2023-comp-baseline")
 FATHOM_PREF = ["fathomnet23-comp-baseline.pt", "best.pt", "yolov8m.pt"]
-# 触发大模型兜底的阈值（根据你的要求，保持 0.80 不变）
 CONF_LOW = float(os.getenv("CONF_LOW", "0.8"))
-# LLM（通过 Hugging Face Inference API 调用）
-HF_TOKEN = os.getenv("HF_TOKEN")  # 必须：在 Space 的 Secrets 里配置
-# 【最终修改】更换为在免费API上稳定可靠的VQA模型，解决404问题
-LLM_MODEL_ID = (os.getenv("LLM_MODEL_ID", "").strip()
-                or "dandelin/vilt-b32-finetuned-vqa")
 # ---------- Utils ----------
 def _resolve_weight(repo_id: str, prefer: List[str]) -> str:
     for fname in prefer:
         try:
             return hf_hub_download(repo_id=repo_id, filename=fname)
@@ -33,6 +57,7 @@ def _resolve_weight(repo_id: str, prefer: List[str]) -> str:
     raise RuntimeError(f"No .pt weights found in repo: {repo_id}")
 def _resize_limit_max_side(img_bgr: np.ndarray, max_side: int = 1280) -> np.ndarray:
     h, w = img_bgr.shape[:2]
     m = max(h, w)
     if m <= max_side:
@@ -41,7 +66,7 @@ def _resize_limit_max_side(img_bgr: np.ndarray, max_side: int = 1280) -> np.ndar
     return cv2.resize(img_bgr, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA)
 def uw_preprocess_bgr(img_bgr: np.ndarray) -> np.ndarray:
-    # 轻量水下增强（清晰照片可关闭）
     lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
     l, a, b = cv2.split(lab); l = cv2.equalizeHist(l)
     img_bgr = cv2.cvtColor(cv2.merge([l, a, b]), cv2.COLOR_LAB2BGR)
@@ -54,6 +79,7 @@ def uw_preprocess_bgr(img_bgr: np.ndarray) -> np.ndarray:
     return np.uint8(np.clip(np.dstack([bch, gch, rch]) * t, 0, 255))
 def _parse_yolo_result(ultra_res, names: Dict[int, str]):
     dets = []
     for b in ultra_res.boxes:
         cls_id = int(b.cls.item())
@@ -64,67 +90,39 @@ def _parse_yolo_result(ultra_res, names: Dict[int, str]):
     return dets
 def _best_box(dets):
     if not dets: return None
     return max(dets, key=lambda d: d["conf"])
 def _crop_xyxy(img: np.ndarray, box, pad: int = 4) -> np.ndarray:
     h, w = img.shape[:2]
     x1, y1, x2, y2 = [int(round(v)) for v in box]
     x1 = max(0, x1 - pad); y1 = max(0, y1 - pad)
     x2 = min(w - 1, x2 + pad); y2 = min(h - 1, y2 + pad)
     return img[y1:y2, x1:x2, :]
-def _jpeg_bytes_from_bgr(bgr: np.ndarray, max_side: int = 768) -> bytes:
-    bgr = _resize_limit_max_side(bgr, max_side=max_side)
-    ok, buf = cv2.imencode(".jpg", bgr, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
-    return buf.tobytes() if ok else b""
-# ---------- Load primary detector ----------
-print("[init] loading FathomNet baseline ...")
-FATHOM_W = _resolve_weight(FATHOM_REPO, FATHOM_PREF)
-FATHOM = YOLO(FATHOM_W)
-# ---------- LLM fallback (VQA model via Inference API) ----------
-# 【最终修改】重写API调用函数，以适配稳定VQA模型的接口和返回格式
-def _call_vision_model_api(model_id: str, img_bytes: bytes, question: str) -> Dict:
-    """调用Hugging Face上的视觉问答（VQA）模型"""
-    url = f"https://api-inference.huggingface.co/models/{model_id}"
-    img_b64 = base64.b64encode(img_bytes).decode("utf-8")
-    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-    # VQA模型使用不同的请求体结构
-    payload = {"inputs": {"question": question, "image": img_b64}}
-    r = requests.post(url, headers=headers, json=payload, timeout=60)
-    r.raise_for_status()
-    # VQA模型的返回格式: [{"score": 0.99, "answer": "..."}, ...]
-    results = r.json()
-    if results and isinstance(results, list) and results[0]:
-        top_result = results[0]
-        return {"label": top_result.get("answer", "vqa_parse_error"),
-                "conf": top_result.get("score", 0.5)}
-    else:
-        return {"label": "vqa_empty_result", "conf": 0.5}
-# 【最终修改】重写LLM备用逻辑，使其调用新的VQA函数
 def llm_fallback(img_bgr: np.ndarray) -> Dict:
-    """当主模型信心不足时，调用VQA模型进行识别。"""
-    if not HF_TOKEN:
-        return {"label": "unknown", "conf": 0.51, "xyxy": None, "note": "HF_TOKEN not set"}
-    jb = _jpeg_bytes_from_bgr(img_bgr, max_side=768)
-    # 构造一个适合VQA模型的“问题”
-    prompt_text = "What is the single, most prominent marine species in this image?"
     try:
-        result = _call_vision_model_api(LLM_MODEL_ID, jb, prompt_text)
-        result["xyxy"] = None  # VQA模型本身不提供边界框坐标
-        return result
     except Exception as e:
-        error_note = str(e).replace(HF_TOKEN, "HF_TOKEN_***") if HF_TOKEN else str(e)
-        return {"label": "unknown", "conf": 0.51, "xyxy": None, "note": f"LLM error: {error_note}"}
 # ---------- Inference ----------
@@ -151,14 +149,12 @@ def predict(
         use_llm = (len(dets) == 0) or (max([d["conf"] for d in dets]) < CONF_LOW if dets else True)
         if use_llm:
-            print("[info] Low confidence or no detection, triggering LLM fallback...")
             roi_img = bgr
             best = _best_box(dets)
             if best is not None:
                 roi_img = _crop_xyxy(bgr, best["xyxy"])
             llm_det = llm_fallback(roi_img)
             if best is not None and llm_det.get("xyxy") is None:
                 llm_det["xyxy"] = best["xyxy"]
             enhanced = [llm_det]
@@ -172,7 +168,6 @@ def predict(
             score = d["conf"]
             xyxy = d.get("xyxy")
             note = d.get("note")
             if xyxy and isinstance(xyxy, list) and len(xyxy) == 4:
                 x1, y1, x2, y2 = map(int, xyxy)
                 color = (0, 255, 0) if not use_llm else (255, 165, 0) # LLM用橙色框
@@ -180,7 +175,7 @@ def predict(
                 cv2.putText(vis, f"{label_show} {score:.2f}", (x1, max(12, y1-6)),
                             cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2)
             else:
-                text_to_show = f"{label_show} {score:.2f} (LLM)"
                 if note:
                     text_to_show = note
                 cv2.putText(vis, text_to_show, (12, 24),
@@ -198,11 +193,11 @@ def predict(
         return None, [{"error": str(e)}]
 # ---------- Gradio UI ----------
-with gr.Blocks(title="Marine Species ID – YOLO primary + VQA fallback") as demo:
     gr.Markdown(
-        "### Marine Species Identification (no training)\n"
         "- **Primary**: FathomNet 2023 Baseline (YOLOv8m)\n"
-        f"- **Fallback**: Stable VQA Model via Hugging Face Inference API (triggered when max conf < {CONF_LOW} or no boxes)"
     )
     with gr.Row():
         with gr.Column(scale=5):
@@ -216,8 +211,9 @@ with gr.Blocks(title="Marine Species ID – YOLO primary + VQA fallback") as dem
             img_out = gr.Image(label="Detections", interactive=False)
             json_out = gr.JSON(label="Detections JSON (label/conf/xyxy)")
     btn.click(predict, inputs=[img_in, conf, iou, imgsz, pre], outputs=[img_out, json_out])
-    gr.Markdown(f"Tip: add `HF_TOKEN` in Settings → Repository secrets. "
-                f"Optional envs: `CONF_LOW={CONF_LOW}`, `LLM_MODEL_ID={LLM_MODEL_ID}`.")
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import numpy as np
 import cv2
+import os, json, re, base64
 from typing import List, Dict
 from huggingface_hub import hf_hub_download, list_repo_files
 from ultralytics import YOLO
+from PIL import Image
+import torch
+# [IMPORTANT] Make sure the requirements.txt of your Hugging Face Space lists the following libraries:
+# transformers
+# torch
+# sentencepiece
+# Pillow
 # ---------- Config ----------
 # Hugging Face repo that hosts the detector weights. Override it with the
 # FATHOM_REPO environment variable. The earlier lookup used the repo id itself
 # as the variable name, so that key is still honoured as a fallback.
 _FATHOM_REPO_DEFAULT = "FathomNet/fathomnet2023-comp-baseline"
 FATHOM_REPO = (os.getenv("FATHOM_REPO", "").strip()
                or os.getenv(_FATHOM_REPO_DEFAULT, _FATHOM_REPO_DEFAULT))
 # Weight filenames to try, in order of preference.
 FATHOM_PREF = ["fathomnet23-comp-baseline.pt", "best.pt", "yolov8m.pt"]
 # Confidence threshold (env CONF_LOW, default 0.8); the fallback model is
 # triggered when the best detection scores below it or there are no boxes.
 CONF_LOW = float(os.getenv("CONF_LOW", "0.8"))
+# ---------- Load primary detector ----------
+# Resolves a weights file from FATHOM_REPO (trying FATHOM_PREF) and builds the YOLO detector at import time.
+# NOTE(review): in this view _resolve_weight is defined in the Utils section further down;
+# if that is the real file order, this module-level call runs before the def exists and
+# raises NameError at startup — confirm, and move this section below Utils.
+print("[init] loading FathomNet baseline (YOLO)...")
+FATHOM_W = _resolve_weight(FATHOM_REPO, FATHOM_PREF)
+FATHOM = YOLO(FATHOM_W)
+# ---------- [Final change] Load a local, reliable fallback model instead of depending on an unstable external API ----------
+print("[init] loading fallback vision model (BLIP)...")
+try:
+    from transformers import BlipProcessor, BlipForConditionalGeneration
+    # Check whether a GPU is available
+    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"[init] Fallback model will run on: {DEVICE}")
+    # Load the model and its processor
+    fallback_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+    fallback_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(DEVICE)
+    FALLBACK_MODEL_LOADED = True
+# NOTE(review): only ImportError is handled; an error raised by from_pretrained
+# (e.g. a failed download) would propagate at import time — confirm this is intended.
+except ImportError:
+    print("[warn] transformers, torch, or pillow not installed. Fallback model will not be available.")
+    print("[warn] Please add 'transformers', 'torch', 'sentencepiece', 'Pillow' to your requirements.txt")
+    FALLBACK_MODEL_LOADED = False
+    DEVICE = "cpu"
 # ---------- Utils ----------
 def _resolve_weight(repo_id: str, prefer: List[str]) -> str:
+    # ... (此函数无需修改)
     for fname in prefer:
         try:
             return hf_hub_download(repo_id=repo_id, filename=fname)
     raise RuntimeError(f"No .pt weights found in repo: {repo_id}")
 def _resize_limit_max_side(img_bgr: np.ndarray, max_side: int = 1280) -> np.ndarray:
+    # ... (此函数无需修改)
     h, w = img_bgr.shape[:2]
     m = max(h, w)
     if m <= max_side:
     return cv2.resize(img_bgr, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA)
 def uw_preprocess_bgr(img_bgr: np.ndarray) -> np.ndarray:
+    # ... (此函数无需修改)
     lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
     l, a, b = cv2.split(lab); l = cv2.equalizeHist(l)
     img_bgr = cv2.cvtColor(cv2.merge([l, a, b]), cv2.COLOR_LAB2BGR)
     return np.uint8(np.clip(np.dstack([bch, gch, rch]) * t, 0, 255))
 def _parse_yolo_result(ultra_res, names: Dict[int, str]):
+    # ... (此函数无需修改)
     dets = []
     for b in ultra_res.boxes:
         cls_id = int(b.cls.item())
     return dets
 def _best_box(dets):
+    # ... (this function needs no changes)
+    # Returns the detection dict with the highest "conf", or None when dets is empty.
     if not dets: return None
     return max(dets, key=lambda d: d["conf"])
 def _crop_xyxy(img: np.ndarray, box, pad: int = 4) -> np.ndarray:
+    # ... (此函数无需修改)
     h, w = img.shape[:2]
     x1, y1, x2, y2 = [int(round(v)) for v in box]
     x1 = max(0, x1 - pad); y1 = max(0, y1 - pad)
     x2 = min(w - 1, x2 + pad); y2 = min(h - 1, y2 + pad)
     return img[y1:y2, x1:x2, :]
+# [Final change] Rewrite the LLM fallback logic so that it calls the locally loaded BLIP model
 def llm_fallback(img_bgr: np.ndarray) -> Dict:
+    """Caption the image with the local BLIP model when the primary model is not confident.
+    Returns a dict with "label", "conf" and "xyxy" (always None here); a "note" key is
+    added when the fallback model is not loaded or raises.
+    """
+    if not FALLBACK_MODEL_LOADED:
+        return {"label": "unknown", "conf": 0.51, "xyxy": None, "note": "Fallback model not loaded"}
     try:
+        # 1. Convert the image from OpenCV format (BGR) to PIL format (RGB)
+        raw_image = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
+        # 2. Prepare the inputs for the model
+        inputs = fallback_processor(raw_image, return_tensors="pt").to(DEVICE)
+        # 3. Generate the caption
+        out = fallback_model.generate(**inputs, max_new_tokens=20)
+        # 4. Decode and clean up the caption text
+        caption = fallback_processor.decode(out[0], skip_special_tokens=True)
+        # Simple heuristic clean-up; may need tuning against real captions
+        caption = caption.replace("a photograph of", "").replace("a close up of", "").strip()
+        # BLIP gives no confidence score, so a fixed value marks this as a fallback-model result
+        return {"label": caption, "conf": 0.60, "xyxy": None}
     except Exception as e:
+        return {"label": "unknown", "conf": 0.51, "xyxy": None, "note": f"Fallback model error: {e}"}
 # ---------- Inference ----------
         use_llm = (len(dets) == 0) or (max([d["conf"] for d in dets]) < CONF_LOW if dets else True)
         if use_llm:
+            print("[info] Low confidence or no detection, triggering LOCAL fallback model...")
             roi_img = bgr
             best = _best_box(dets)
             if best is not None:
                 roi_img = _crop_xyxy(bgr, best["xyxy"])
             llm_det = llm_fallback(roi_img)
             if best is not None and llm_det.get("xyxy") is None:
                 llm_det["xyxy"] = best["xyxy"]
             enhanced = [llm_det]
             score = d["conf"]
             xyxy = d.get("xyxy")
             note = d.get("note")
             if xyxy and isinstance(xyxy, list) and len(xyxy) == 4:
                 x1, y1, x2, y2 = map(int, xyxy)
                 color = (0, 255, 0) if not use_llm else (255, 165, 0) # LLM用橙色框
                 cv2.putText(vis, f"{label_show} {score:.2f}", (x1, max(12, y1-6)),
                             cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2)
             else:
+                text_to_show = f"{label_show} {score:.2f} (Fallback Model)"
                 if note:
                     text_to_show = note
                 cv2.putText(vis, text_to_show, (12, 24),
         return None, [{"error": str(e)}]
 # ---------- Gradio UI ----------
+with gr.Blocks(title="Marine Species ID – YOLO primary + Local Fallback") as demo:
     gr.Markdown(
+        "### Marine Species Identification (with Self-Contained Fallback)\n"
         "- **Primary**: FathomNet 2023 Baseline (YOLOv8m)\n"
+        f"- **Fallback**: Local BLIP Model (triggered when max conf < {CONF_LOW} or no boxes)"
     )
     with gr.Row():
         with gr.Column(scale=5):
             img_out = gr.Image(label="Detections", interactive=False)
             json_out = gr.JSON(label="Detections JSON (label/conf/xyxy)")
     btn.click(predict, inputs=[img_in, conf, iou, imgsz, pre], outputs=[img_out, json_out])
+    gr.Markdown("Tip: This app is now self-contained and does not require an HF_TOKEN. "
+                "The first launch may be slow due to model downloads.")
 if __name__ == "__main__":
     demo.launch()