"""Import JSON exports into ChromaDB collections.

Each ``*.json`` file in the input folder is loaded into a collection named
after the file (without the ``.json`` suffix). A file is expected to contain
a JSON array of objects with an ``id`` key and optional ``document``,
``metadata`` and ``embedding`` keys.
"""

import argparse
import json
from pathlib import Path

import chromadb
from chromadb.config import Settings

# Used only when the client cannot report its own per-call record limit.
_FALLBACK_BATCH_SIZE = 1000


def _resolve_batch_size(client):
    """Return how many records may be sent to Chroma in a single ``add`` call."""
    # NOTE(review): ``get_max_batch_size`` is assumed to be available on the
    # client in use; the lookup is defensive so older clients still work.
    getter = getattr(client, "get_max_batch_size", None)
    if callable(getter):
        limit = getter()
        if isinstance(limit, int) and limit > 0:
            return limit
    return _FALLBACK_BATCH_SIZE


def _window(values, start, stop):
    """Slice ``values[start:stop]``, passing ``None`` through unchanged."""
    return None if values is None else values[start:stop]


def import_collection(client, json_file: Path, include_embeddings=False):
    """Import a JSON file into a ChromaDB collection.

    The collection is named after ``json_file.stem`` and is created if it
    does not exist yet. Records are added in batches so that large exports
    do not exceed the client's per-call limit.

    Args:
        client: ChromaDB client providing ``get_or_create_collection``.
        json_file: Path to a UTF-8 JSON file holding an array of records.
        include_embeddings: When true, the ``embedding`` value of every
            record is passed to Chroma; otherwise no embeddings are passed.

    Raises:
        ValueError: If the file's top level is not a JSON array, or if
            ``include_embeddings`` is set and a record has no embedding.
        KeyError: If a record has no ``id`` key.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    collection_name = json_file.stem
    print(f"📄 Importing {collection_name} from {json_file}")

    # Load JSON
    data = json.loads(json_file.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(
            f"{json_file}: expected a JSON array of records, "
            f"got {type(data).__name__}"
        )

    # Create or get collection (done even for an empty export, so that an
    # empty collection is still recreated in the target store).
    collection = client.get_or_create_collection(collection_name)

    if not data:
        # Chroma rejects an ``add`` call with an empty id list.
        print(f"✔ Imported 0 items into {collection_name}")
        return

    # Extract fields
    ids = [item["id"] for item in data]
    documents = [item.get("document") for item in data]
    # An empty metadata dict is normalised to None.
    # NOTE(review): Chroma is understood to reject empty dicts — confirm.
    metadatas = [item.get("metadata") or None for item in data]

    # A column that is absent from every record is omitted entirely rather
    # than sent as a list of None values.
    if all(doc is None for doc in documents):
        documents = None
    if all(meta is None for meta in metadatas):
        metadatas = None

    embeddings = None
    if include_embeddings:
        embeddings = [item.get("embedding") for item in data]
        missing = [
            ids[pos] for pos, emb in enumerate(embeddings) if emb is None
        ]
        if missing:
            raise ValueError(
                f"{json_file}: {len(missing)} record(s) have no embedding "
                f"(first id: {missing[0]!r}); re-export with embeddings or "
                f"omit --include-embeddings"
            )

    # Add to collection, one batch at a time.
    batch_size = _resolve_batch_size(client)
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            documents=_window(documents, start, stop),
            metadatas=_window(metadatas, start, stop),
            embeddings=_window(embeddings, start, stop),
        )

    print(f"✔ Imported {len(ids)} items into {collection_name}")


def main():
    """Parse command-line arguments and import every JSON file found.

    Returns:
        Process exit status: ``0`` on success (including when the input
        folder holds no JSON files), ``1`` when the input folder is missing
        or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Import JSON files into ChromaDB collections."
    )
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the target chromadb_store folder",
    )
    parser.add_argument(
        "--input",
        type=str,
        default="chroma_exports",
        help="Folder containing JSON files to import",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Load embeddings from JSON (off by default)",
    )
    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    input_dir = Path(args.input).expanduser().resolve()

    if not input_dir.exists():
        print(f"❌ Input folder does not exist: {input_dir}")
        return 1
    if not input_dir.is_dir():
        print(f"❌ Input path is not a folder: {input_dir}")
        return 1

    # Sorted so that the import order is the same on every run and platform.
    json_files = sorted(input_dir.glob("*.json"))
    if not json_files:
        # Checked before connecting so no store is created for a no-op run.
        print(f"No JSON files found in {input_dir}")
        return 0

    # Connect to ChromaDB
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    # Iterate JSON files
    for json_file in json_files:
        import_collection(client, json_file, args.include_embeddings)

    print("\n🎉 All JSON files imported!")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())