Spaces:
Running
Running
| """Utilities for persisting bookmarked repositories and their vector stores.""" | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import re | |
| import shutil | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional, Tuple | |
| from agent import build_repo_vector_store | |
# Root directory where bookmark snapshots (docs, vector store, metadata.json) live.
BOOKMARKS_ROOT = Path("data/bookmarks")
# Root directory for per-repo cache folders keyed by slug (see get_cache_dirs).
CACHE_ROOT = Path("data/cache")
# Per-bookmark subdirectory holding the raw documentation files.
DOCS_SUBDIR = "docs"
# Per-bookmark subdirectory holding the persisted vector store.
VECTOR_SUBDIR = "vector"
@dataclass
class BookmarkMetadata:
    """On-disk metadata describing one bookmarked repository.

    Fields mirror the JSON payload written to ``metadata.json``; the class
    must be a dataclass because callers construct it with keyword arguments
    (the decorator was missing, making every construction raise TypeError).
    """

    slug: str  # filesystem-safe identifier derived from repo URL + name
    repo_name: str
    repo_url: str
    last_pulled_display: str  # dd/mm/yyyy string shown in the UI
    last_pulled_iso: str  # ISO-8601 timestamp used for sorting
    docs_count: int
    vector_chunks: int
    dropdown_label: str  # human-readable label for the bookmark dropdown
    docs_dir: str
    vector_dir: str
    summary_preview: str = ""  # short excerpt of the repo's docs, may be empty

    def to_payload(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict with every metadata field."""
        return {
            "slug": self.slug,
            "repo_name": self.repo_name,
            "repo_url": self.repo_url,
            "last_pulled_display": self.last_pulled_display,
            "last_pulled_iso": self.last_pulled_iso,
            "docs_count": self.docs_count,
            "vector_chunks": self.vector_chunks,
            "dropdown_label": self.dropdown_label,
            "docs_dir": self.docs_dir,
            "vector_dir": self.vector_dir,
            "summary_preview": self.summary_preview,
        }
def _ensure_root() -> None:
    """Make sure the bookmarks root directory exists on disk."""
    root = BOOKMARKS_ROOT
    root.mkdir(parents=True, exist_ok=True)
def _ensure_cache_root() -> None:
    """Make sure the cache root directory exists on disk."""
    root = CACHE_ROOT
    root.mkdir(parents=True, exist_ok=True)
def slugify_repo(repo_url: str, repo_name: str) -> str:
    """Build a filesystem-safe slug of the form ``<name>-<8-char-sha1>``.

    The SHA-1 digest of the URL keeps slugs unique even when two
    repositories share a display name; when the name normalizes to an
    empty string, the digest alone is returned.
    """
    digest = hashlib.sha1(repo_url.encode("utf-8")).hexdigest()[:8]
    cleaned = re.sub(r"[^a-zA-Z0-9-]+", "-", repo_name.lower()).strip("-")
    if not cleaned:
        return digest
    cleaned = re.sub(r"-+", "-", cleaned)
    return f"{cleaned}-{digest}"
| def _format_label(repo_name: str, display_date: str, slug: str) -> str: | |
| short_slug = slug[:6].upper() | |
| return f"{repo_name} – {display_date} · {short_slug}" | |
def get_cache_dirs(repo_url: str, repo_name: str) -> Tuple[str, Path, Path]:
    """Return ``(slug, cache_dir, vector_dir)`` for a repo, creating the cache root."""
    _ensure_cache_root()
    slug = slugify_repo(repo_url, repo_name)
    base = CACHE_ROOT / slug
    return slug, base, base / VECTOR_SUBDIR
| def _write_docs(docs_dir: Path, documentation: List[Dict[str, Any]]) -> None: | |
| if docs_dir.exists(): | |
| shutil.rmtree(docs_dir) | |
| docs_dir.mkdir(parents=True, exist_ok=True) | |
| for doc in documentation: | |
| rel_path = Path(doc.get("path") or "document.txt") | |
| target = docs_dir / rel_path | |
| target.parent.mkdir(parents=True, exist_ok=True) | |
| target.write_text(doc.get("content", ""), encoding="utf-8") | |
| def _select_summary_preview(documentation: List[Dict[str, Any]], *, max_chars: int = 800) -> str: | |
| def normalize(text: str) -> str: | |
| return " ".join((text or "").split()) | |
| preferred_doc = None | |
| for doc in documentation: | |
| path = (doc.get("path") or "").lower() | |
| if "readme" in path or "overview" in path or path.endswith(".md"): | |
| preferred_doc = doc | |
| break | |
| if not preferred_doc and documentation: | |
| preferred_doc = documentation[0] | |
| if not preferred_doc: | |
| return "" | |
| content = normalize(preferred_doc.get("content") or "") | |
| if not content: | |
| return "" | |
| snippet = content[:max_chars] | |
| return snippet | |
def bookmark_repo_from_analysis(
    repo_url: str,
    analysis: Dict[str, Any],
    *,
    prebuilt_vector_dir: Optional[Path] = None,
    prebuilt_chunks: Optional[int] = None,
) -> BookmarkMetadata:
    """Persist an analyzed repository as a bookmark on disk.

    Writes the documentation files, builds (or copies) the vector store,
    and records a ``metadata.json`` alongside them under
    ``BOOKMARKS_ROOT/<slug>/``.

    Args:
        repo_url: Repository URL; combined with the repo name to derive the slug.
        analysis: Analysis payload; only ``documentation`` (list of doc dicts)
            and ``repo_name`` are read from it.
        prebuilt_vector_dir: When given and it exists on disk, its contents are
            copied instead of rebuilding the vector store from documentation.
        prebuilt_chunks: Chunk count matching *prebuilt_vector_dir*; when not
            provided the count stays 0 for the prebuilt branch.

    Returns:
        The freshly written ``BookmarkMetadata``.
    """
    documentation = analysis.get("documentation") or []
    repo_name = analysis.get("repo_name") or repo_url
    _ensure_root()
    slug = slugify_repo(repo_url, repo_name)
    repo_dir = BOOKMARKS_ROOT / slug
    docs_dir = repo_dir / DOCS_SUBDIR
    vector_dir = repo_dir / VECTOR_SUBDIR
    _write_docs(docs_dir, documentation)
    summary_preview = _select_summary_preview(documentation)
    chunk_total = 0
    if prebuilt_vector_dir and prebuilt_vector_dir.exists():
        # Reuse an already-built vector store: drop any stale copy first so the
        # copy is a clean replacement rather than a merge.
        if vector_dir.exists():
            shutil.rmtree(vector_dir)
        shutil.copytree(prebuilt_vector_dir, vector_dir, dirs_exist_ok=True)
        chunk_total = prebuilt_chunks if prebuilt_chunks is not None else chunk_total
    else:
        # Build from scratch; a None store means nothing was embedded.
        vectorstore, chunk_count = build_repo_vector_store(documentation, persist_path=vector_dir)
        chunk_total = chunk_count if vectorstore is not None else 0
    now = datetime.now(timezone.utc)
    display_date = now.strftime("%d/%m/%Y")  # dd/mm/yyyy as shown in the UI
    metadata = BookmarkMetadata(
        slug=slug,
        repo_name=repo_name,
        repo_url=repo_url,
        last_pulled_display=display_date,
        last_pulled_iso=now.isoformat(),
        docs_count=len(documentation),
        vector_chunks=chunk_total,
        dropdown_label=_format_label(repo_name, display_date, slug),
        docs_dir=str(docs_dir),
        vector_dir=str(vector_dir),
        summary_preview=summary_preview,
    )
    (repo_dir / "metadata.json").write_text(json.dumps(metadata.to_payload(), indent=2), encoding="utf-8")
    return metadata
def _load_metadata_file(meta_path: Path) -> Optional[BookmarkMetadata]:
    """Parse a bookmark's ``metadata.json``; return None when unreadable or invalid."""
    try:
        payload = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return None
    folder = meta_path.parent
    slug = payload.get("slug") or folder.name
    name = payload.get("repo_name") or slug
    pulled = payload.get("last_pulled_display") or payload.get("last_pulled_iso") or "--/--/----"
    label = payload.get("dropdown_label") or _format_label(name, pulled, slug)
    return BookmarkMetadata(
        slug=slug,
        repo_name=name,
        repo_url=payload.get("repo_url", ""),
        last_pulled_display=pulled,
        last_pulled_iso=payload.get("last_pulled_iso", ""),
        docs_count=int(payload.get("docs_count", 0)),
        vector_chunks=int(payload.get("vector_chunks", 0)),
        dropdown_label=label,
        docs_dir=payload.get("docs_dir", str(folder / DOCS_SUBDIR)),
        vector_dir=payload.get("vector_dir", str(folder / VECTOR_SUBDIR)),
        summary_preview=payload.get("summary_preview", ""),
    )
def list_bookmark_metadata() -> List[Dict[str, Any]]:
    """Load every bookmark's metadata payload, most recently pulled first."""
    _ensure_root()
    loaded = (_load_metadata_file(child / "metadata.json") for child in BOOKMARKS_ROOT.iterdir())
    payloads = [meta.to_payload() for meta in loaded if meta]
    payloads.sort(key=lambda item: item.get("last_pulled_iso", ""), reverse=True)
    return payloads
def get_dropdown_options() -> Tuple[List[str], List[Dict[str, Any]]]:
    """Return ``(labels, payloads)`` for populating the bookmark dropdown."""
    payloads = list_bookmark_metadata()
    labels = [item["dropdown_label"] for item in payloads]
    return labels, payloads
def find_metadata_by_label(label: str, metadata_list: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Return the first metadata entry whose dropdown label matches, else None."""
    matches = (entry for entry in metadata_list if entry.get("dropdown_label") == label)
    return next(matches, None)
def find_metadata_by_url(repo_url: str) -> Optional[Dict[str, Any]]:
    """Look up a stored bookmark by repository URL; None when absent or blank."""
    if not repo_url:
        return None
    matches = (entry for entry in list_bookmark_metadata() if entry.get("repo_url") == repo_url)
    return next(matches, None)