|
|
|
|
|
import os |
|
|
import mlflow |
|
|
import chromadb |
|
|
from langchain_community.document_loaders import TextLoader, DirectoryLoader |
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
from langchain_community.vectorstores import Chroma |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
|
|
|
# Load environment variables (e.g. MLFLOW_TRACKING_URI, read in run()) from a
# local .env file into os.environ.
load_dotenv()


# --- Pipeline configuration ---
DATA_PATH = "data/raw"               # directory holding the raw .txt source files
DB_PATH = "data/chroma_db"           # on-disk location of the persisted Chroma store
COLLECTION_NAME = "rag_experiments"  # Chroma collection the chunks are written into
# Hugging Face sentence-embedding model used to embed each chunk.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
|
|
|
class IngestionPipeline:
    """Ingest raw text files into a ChromaDB vector store, tracked with MLflow.

    Stages: load ``*.txt`` documents from ``DATA_PATH`` -> split them into
    overlapping chunks -> embed the chunks with ``EMBEDDING_MODEL`` and persist
    them to the Chroma collection at ``DB_PATH``.
    """

    def __init__(self, device="mps"):
        """Initialize the embedding model.

        Args:
            device: Torch device string for the embedding model. Defaults to
                ``"mps"`` (Apple Silicon) to preserve the original behavior;
                pass ``"cpu"`` or ``"cuda"`` on other hosts.
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={'device': device},
        )

    def load_documents(self):
        """Loads text files from the data directory."""
        loader = DirectoryLoader(DATA_PATH, glob="*.txt", loader_cls=TextLoader)
        documents = loader.load()
        print(f"π Loaded {len(documents)} documents.")
        return documents

    def chunk_documents(self, documents, chunk_size=1000, chunk_overlap=200):
        """Splits documents into smaller chunks.

        Args:
            documents: Sequence of langchain ``Document`` objects.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between adjacent chunks.

        Returns:
            List of chunked ``Document`` objects.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = text_splitter.split_documents(documents)
        print(f"π§© Split into {len(chunks)} chunks.")
        return chunks

    def store_embeddings(self, chunks):
        """Embeds chunks and stores them in ChromaDB.

        Note: ``Chroma.from_documents`` with a ``persist_directory`` adds to
        the existing on-disk collection rather than replacing it, so rerunning
        the pipeline appends (and may duplicate) chunks.
        """
        if os.path.exists(DB_PATH):
            print("β οΈ Existing DB found. Appending...")

        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=DB_PATH,
            collection_name=COLLECTION_NAME,
        )
        print(f"πΎ Saved to {DB_PATH}")
        return vectorstore

    def run(self, chunk_size=1000, chunk_overlap=200):
        """Runs the full pipeline with MLflow tracking.

        Args:
            chunk_size: Passed through to :meth:`chunk_documents` and logged.
            chunk_overlap: Passed through to :meth:`chunk_documents` and logged.
        """
        mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))

        with mlflow.start_run(run_name="Ingestion_Phase_2"):
            # Log the values actually used for chunking. Previously these were
            # duplicated as hard-coded literals here, which could silently
            # drift from the defaults in chunk_documents().
            mlflow.log_param("embedding_model", EMBEDDING_MODEL)
            mlflow.log_param("chunk_size", chunk_size)
            mlflow.log_param("chunk_overlap", chunk_overlap)

            docs = self.load_documents()
            chunks = self.chunk_documents(
                docs, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            self.store_embeddings(chunks)

            mlflow.log_metric("num_documents", len(docs))
            mlflow.log_metric("num_chunks", len(chunks))

        # Reconstructed: the original literal was split across two lines by the
        # extraction, leaving an unterminated string.
        print("✅ Ingestion complete and logged to Dagshub!")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the pipeline and run the full ingestion flow.
    IngestionPipeline().run()
|
|
|