Spaces:
Running
Running
| """Utilities for persisting bookmarked repositories and their vector stores.""" | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import re | |
| import shutil | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional, Tuple | |
| from agent import build_repo_vector_store | |
# Root directory where bookmark snapshots (docs, vector store, metadata.json) live.
BOOKMARKS_ROOT = Path("data/bookmarks")
# Root directory for per-repo cache folders keyed by slug (see get_cache_dirs).
CACHE_ROOT = Path("data/cache")
# Per-bookmark subdirectory holding the raw documentation files.
DOCS_SUBDIR = "docs"
# Per-bookmark subdirectory holding the persisted vector store.
VECTOR_SUBDIR = "vector"
@dataclass
class BookmarkMetadata:
    """On-disk metadata describing one bookmarked repository.

    Fields mirror the JSON payload written to ``metadata.json``; the class
    must be a dataclass because callers construct it with keyword arguments
    (the decorator was missing, making every construction raise TypeError).
    """

    slug: str  # filesystem-safe identifier derived from repo URL + name
    repo_name: str
    repo_url: str
    last_pulled_display: str  # dd/mm/yyyy string shown in the UI
    last_pulled_iso: str  # ISO-8601 timestamp used for sorting
    docs_count: int
    vector_chunks: int
    dropdown_label: str  # human-readable label for the bookmark dropdown
    docs_dir: str
    vector_dir: str
    summary_preview: str = ""  # short excerpt of the repo's docs, may be empty

    def to_payload(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict with every metadata field."""
        return {
            "slug": self.slug,
            "repo_name": self.repo_name,
            "repo_url": self.repo_url,
            "last_pulled_display": self.last_pulled_display,
            "last_pulled_iso": self.last_pulled_iso,
            "docs_count": self.docs_count,
            "vector_chunks": self.vector_chunks,
            "dropdown_label": self.dropdown_label,
            "docs_dir": self.docs_dir,
            "vector_dir": self.vector_dir,
            "summary_preview": self.summary_preview,
        }
def _ensure_root() -> None:
    """Make sure the bookmarks root directory exists on disk."""
    root = BOOKMARKS_ROOT
    root.mkdir(parents=True, exist_ok=True)
def _ensure_cache_root() -> None:
    """Make sure the cache root directory exists on disk."""
    root = CACHE_ROOT
    root.mkdir(parents=True, exist_ok=True)
def slugify_repo(repo_url: str, repo_name: str) -> str:
    """Build a filesystem-safe slug of the form ``<name>-<8-char-sha1>``.

    The SHA-1 digest of the URL keeps slugs unique even when two
    repositories share a display name; when the name normalizes to an
    empty string, the digest alone is returned.
    """
    digest = hashlib.sha1(repo_url.encode("utf-8")).hexdigest()[:8]
    cleaned = re.sub(r"[^a-zA-Z0-9-]+", "-", repo_name.lower()).strip("-")
    if not cleaned:
        return digest
    cleaned = re.sub(r"-+", "-", cleaned)
    return f"{cleaned}-{digest}"
| def _format_label(repo_name: str, display_date: str, slug: str) -> str: | |
| short_slug = slug[:6].upper() | |
| return f"{repo_name} – {display_date} · {short_slug}" | |
def get_cache_dirs(repo_url: str, repo_name: str) -> Tuple[str, Path, Path]:
    """Return ``(slug, cache_dir, vector_dir)`` for a repo, creating the cache root."""
    _ensure_cache_root()
    slug = slugify_repo(repo_url, repo_name)
    base = CACHE_ROOT / slug
    return slug, base, base / VECTOR_SUBDIR
| def _write_docs(docs_dir: Path, documentation: List[Dict[str, Any]]) -> None: | |
| if docs_dir.exists(): | |
| shutil.rmtree(docs_dir) | |
| docs_dir.mkdir(parents=True, exist_ok=True) | |
| for doc in documentation: | |
| rel_path = Path(doc.get("path") or "document.txt") | |
| target = docs_dir / rel_path | |
| target.parent.mkdir(parents=True, exist_ok=True) | |
| target.write_text(doc.get("content", ""), encoding="utf-8") | |
| def _select_summary_preview(documentation: List[Dict[str, Any]], *, max_chars: int = 800) -> str: | |
| def normalize(text: str) -> str: | |
| return " ".join((text or "").split()) | |
| preferred_doc = None | |
| for doc in documentation: | |
| path = (doc.get("path") or "").lower() | |
| if "readme" in path or "overview" in path or path.endswith(".md"): | |
| preferred_doc = doc | |
| break | |
| if not preferred_doc and documentation: | |
| preferred_doc = documentation[0] | |
| if not preferred_doc: | |
| return "" | |
| content = normalize(preferred_doc.get("content") or "") | |
| if not content: | |
| return "" | |
| snippet = content[:max_chars] | |
| return snippet | |
def bookmark_repo_from_analysis(
    repo_url: str,
    analysis: Dict[str, Any],
    *,
    prebuilt_vector_dir: Optional[Path] = None,
    prebuilt_chunks: Optional[int] = None,
) -> BookmarkMetadata:
    """Persist an analyzed repository as a bookmark on disk.

    Writes the documentation files, builds (or copies) the vector store,
    and records a ``metadata.json`` alongside them under
    ``BOOKMARKS_ROOT/<slug>/``.

    Args:
        repo_url: Repository URL; combined with the repo name to derive the slug.
        analysis: Analysis payload; only ``documentation`` (list of doc dicts)
            and ``repo_name`` are read from it.
        prebuilt_vector_dir: When given and it exists on disk, its contents are
            copied instead of rebuilding the vector store from documentation.
        prebuilt_chunks: Chunk count matching *prebuilt_vector_dir*; when not
            provided the count stays 0 for the prebuilt branch.

    Returns:
        The freshly written ``BookmarkMetadata``.
    """
    documentation = analysis.get("documentation") or []
    repo_name = analysis.get("repo_name") or repo_url
    _ensure_root()
    slug = slugify_repo(repo_url, repo_name)
    repo_dir = BOOKMARKS_ROOT / slug
    docs_dir = repo_dir / DOCS_SUBDIR
    vector_dir = repo_dir / VECTOR_SUBDIR
    _write_docs(docs_dir, documentation)
    summary_preview = _select_summary_preview(documentation)
    chunk_total = 0
    if prebuilt_vector_dir and prebuilt_vector_dir.exists():
        # Reuse an already-built vector store: drop any stale copy first so the
        # copy is a clean replacement rather than a merge.
        if vector_dir.exists():
            shutil.rmtree(vector_dir)
        shutil.copytree(prebuilt_vector_dir, vector_dir, dirs_exist_ok=True)
        chunk_total = prebuilt_chunks if prebuilt_chunks is not None else chunk_total
    else:
        # Build from scratch; a None store means nothing was embedded.
        vectorstore, chunk_count = build_repo_vector_store(documentation, persist_path=vector_dir)
        chunk_total = chunk_count if vectorstore is not None else 0
    now = datetime.now(timezone.utc)
    display_date = now.strftime("%d/%m/%Y")  # dd/mm/yyyy as shown in the UI
    metadata = BookmarkMetadata(
        slug=slug,
        repo_name=repo_name,
        repo_url=repo_url,
        last_pulled_display=display_date,
        last_pulled_iso=now.isoformat(),
        docs_count=len(documentation),
        vector_chunks=chunk_total,
        dropdown_label=_format_label(repo_name, display_date, slug),
        docs_dir=str(docs_dir),
        vector_dir=str(vector_dir),
        summary_preview=summary_preview,
    )
    (repo_dir / "metadata.json").write_text(json.dumps(metadata.to_payload(), indent=2), encoding="utf-8")
    return metadata
def _load_metadata_file(meta_path: Path) -> Optional[BookmarkMetadata]:
    """Parse a bookmark's ``metadata.json``; return None when unreadable or invalid."""
    try:
        payload = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return None
    folder = meta_path.parent
    slug = payload.get("slug") or folder.name
    name = payload.get("repo_name") or slug
    pulled = payload.get("last_pulled_display") or payload.get("last_pulled_iso") or "--/--/----"
    label = payload.get("dropdown_label") or _format_label(name, pulled, slug)
    return BookmarkMetadata(
        slug=slug,
        repo_name=name,
        repo_url=payload.get("repo_url", ""),
        last_pulled_display=pulled,
        last_pulled_iso=payload.get("last_pulled_iso", ""),
        docs_count=int(payload.get("docs_count", 0)),
        vector_chunks=int(payload.get("vector_chunks", 0)),
        dropdown_label=label,
        docs_dir=payload.get("docs_dir", str(folder / DOCS_SUBDIR)),
        vector_dir=payload.get("vector_dir", str(folder / VECTOR_SUBDIR)),
        summary_preview=payload.get("summary_preview", ""),
    )
def list_bookmark_metadata() -> List[Dict[str, Any]]:
    """Load every bookmark's metadata payload, most recently pulled first."""
    _ensure_root()
    loaded = (_load_metadata_file(child / "metadata.json") for child in BOOKMARKS_ROOT.iterdir())
    payloads = [meta.to_payload() for meta in loaded if meta]
    payloads.sort(key=lambda item: item.get("last_pulled_iso", ""), reverse=True)
    return payloads
def get_dropdown_options() -> Tuple[List[str], List[Dict[str, Any]]]:
    """Return ``(labels, payloads)`` for populating the bookmark dropdown."""
    payloads = list_bookmark_metadata()
    labels = [item["dropdown_label"] for item in payloads]
    return labels, payloads
def find_metadata_by_label(label: str, metadata_list: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Return the first metadata entry whose dropdown label matches, else None."""
    matches = (entry for entry in metadata_list if entry.get("dropdown_label") == label)
    return next(matches, None)
def find_metadata_by_url(repo_url: str) -> Optional[Dict[str, Any]]:
    """Look up a stored bookmark by repository URL; None when absent or blank."""
    if not repo_url:
        return None
    matches = (entry for entry in list_bookmark_metadata() if entry.get("repo_url") == repo_url)
    return next(matches, None)