"""Export every collection of a persistent ChromaDB store to JSON files."""

import argparse
import json
from pathlib import Path

import chromadb
from chromadb.config import Settings


def _has_values(column) -> bool:
    """Return True when *column* is present and non-empty.

    Uses ``len()`` instead of truthiness so that array-like columns (whose
    truth value is ambiguous for more than one element) are handled the same
    way as plain lists.
    """
    return column is not None and len(column) > 0


def _to_jsonable(value):
    """Return *value* in a form ``json.dumps`` can serialize.

    Objects exposing a callable ``tolist()`` (e.g. NumPy arrays) are converted
    through it; everything else is returned unchanged.
    """
    to_list = getattr(value, "tolist", None)
    return to_list() if callable(to_list) else value


def export_collection(
    collection, output_dir: Path, include_embeddings: bool = False
) -> None:
    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.

    The file holds a JSON list with one object per item, carrying the keys
    ``id``, ``document``, ``metadata`` and, when requested, ``embedding``.

    Args:
        collection: Object providing ``get(include=[...])`` and a ``name``
            attribute (a ChromaDB collection).
        output_dir: Existing directory the JSON file is written into.
        include_embeddings: Also fetch and export each item's embedding.
    """
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")

    # Pull everything in one call (large collections may need pagination).
    items = collection.get(include=include_fields)

    documents = items.get("documents")
    metadatas = items.get("metadatas")
    embeddings = items.get("embeddings") if include_embeddings else None

    # Evaluate presence once, and never via bare truthiness.
    # NOTE(review): newer ChromaDB releases reportedly return embeddings as a
    # NumPy array, for which `if array:` raises ValueError -- confirm against
    # the installed version.
    has_documents = _has_values(documents)
    has_metadatas = _has_values(metadatas)
    has_embeddings = _has_values(embeddings)

    data = []
    for idx, item_id in enumerate(items["ids"]):
        record = {
            "id": item_id,
            "document": documents[idx] if has_documents else None,
            "metadata": metadatas[idx] if has_metadatas else None,
        }
        if include_embeddings:
            # Array rows are not JSON-serializable; convert them to lists.
            record["embedding"] = (
                _to_jsonable(embeddings[idx]) if has_embeddings else None
            )
        data.append(record)

    # Write to .json
    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f"✔ Exported {collection.name} → {out_path}")


def main() -> None:
    """Parse CLI arguments and export every collection found in the store."""
    parser = argparse.ArgumentParser(
        description="Export ChromaDB collections to JSON."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder (where the DB is persisted)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for json files",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export (off by default)",
    )
    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    # Fail fast on a mistyped path rather than running against a store that
    # does not exist and reporting success with nothing exported.
    if not db_path.is_dir():
        parser.error(f"--db-path is not an existing directory: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the persistent ChromaDB store
    client = chromadb.PersistentClient(
        path=str(db_path), settings=Settings(anonymized_telemetry=False)
    )

    # Iterate collections
    for entry in client.list_collections():
        # NOTE(review): depending on the ChromaDB release, list_collections()
        # yields Collection objects or plain name strings -- accept both.
        name = entry if isinstance(entry, str) else entry.name
        collection = client.get_collection(name)
        export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 All collections exported!")


if __name__ == "__main__":
    main()