Arif committed
Commit faa44eb · 0 Parent(s)

Initial commit (Clean history)
.gitignore ADDED
@@ -0,0 +1,20 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+
+ # Environments
+ .env
+
+ # Databases
+ *.db
+ *.sqlite3
+ data/chroma_db/
+ mlflow.db
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # Use official Python runtime (3.12, to match .python-version and requires-python)
+ FROM python:3.12-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install UV
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+ # Copy dependency files
+ COPY pyproject.toml uv.lock ./
+
+ # Install dependencies
+ # Note: we exclude mlx here because it's Mac-only (it lives in the "local" dependency group).
+ # We install the rest of the project deps.
+ RUN uv sync --frozen --no-install-project
+
+ # Copy source code
+ COPY src ./src
+ COPY app ./app
+ COPY .env ./.env
+
+ # Expose Streamlit port (must be 7860 for HF Spaces)
+ EXPOSE 7860
+
+ # Command to run the app
+ CMD ["uv", "run", "streamlit", "run", "app/frontend/app.py", "--server.address=0.0.0.0", "--server.port=7860"]
README.md ADDED
File without changes
app/frontend/app.py ADDED
@@ -0,0 +1,37 @@
+ import streamlit as st
+ import sys
+ import os
+
+ # 1. Fix path so the project root (and the `src` package) is importable
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+ from src.retrieval.rag_chain import build_rag_chain
+
+ st.set_page_config(page_title="RAG Observability Platform", layout="wide")
+ st.title("🤖 RAG Observability Platform")
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ @st.cache_resource
+ def load_chain():
+     return build_rag_chain()
+
+ rag_chain = load_chain()
+
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ if prompt := st.chat_input("Ask a question..."):
+     st.session_state.messages.append({"role": "user", "content": prompt})
+     with st.chat_message("user"):
+         st.markdown(prompt)
+
+     with st.chat_message("assistant"):
+         with st.spinner("Thinking (M4 GPU)..."):
+             # LCEL invoke: the chain takes the raw question string directly
+             response = rag_chain.invoke(prompt)
+         st.markdown(response)
+
+     st.session_state.messages.append({"role": "assistant", "content": response})
data/raw/sample.txt ADDED
@@ -0,0 +1 @@
+ The RAG Observability Platform is a project combining MLX for local inference and Docker for cloud deployment. It uses Dagshub for tracking experiments.
docker/Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # Use python 3.12 slim image (matches requires-python in pyproject.toml)
+ FROM python:3.12-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install uv
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+ # Copy project files
+ COPY pyproject.toml uv.lock ./
+ COPY src ./src
+ COPY app ./app
+ COPY .env ./.env
+ COPY data ./data
+
+ # Install dependencies (excluding the 'local' group, which contains MLX)
+ RUN uv sync --frozen --no-install-project --no-group local
+
+ # Expose the port Streamlit runs on (7860 is mandatory for HF Spaces)
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uv", "run", "streamlit", "run", "app/frontend/app.py", "--server.port=7860", "--server.address=0.0.0.0"]
main.py ADDED
@@ -0,0 +1,6 @@
+ def main():
+     print("Hello from rag-observability-platform!")
+
+
+ if __name__ == "__main__":
+     main()
pyproject.toml ADDED
@@ -0,0 +1,25 @@
+ [project]
+ name = "rag-observability-platform"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "chromadb>=1.3.5",
+     "dagshub>=0.6.3",
+     "langchain>=1.1.2",
+     "langchain-chroma>=1.0.0",
+     "langchain-community>=0.4.1",
+     "langchain-huggingface>=1.1.0",
+     "mlflow>=3.7.0",
+     "pypdf>=6.4.0",
+     "python-dotenv>=1.2.1",
+     "sentence-transformers>=5.1.2",
+     "streamlit>=1.52.1",
+ ]
+
+ [dependency-groups]
+ local = [
+     "mlx>=0.30.0",
+     "mlx-lm>=0.28.4",
+ ]
src/generation/mlx_wrapper.py ADDED
@@ -0,0 +1,50 @@
+ # src/generation/mlx_wrapper.py
+ import os
+ from typing import Any, List, Optional
+ from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+ from langchain_core.language_models.llms import LLM
+ from mlx_lm import load, generate
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ class MLXLLM(LLM):
+     """Custom LangChain wrapper for MLX models."""
+
+     model_id: str = os.getenv("MODEL_ID", "mlx-community/Llama-3.2-3B-Instruct-4bit")
+     model: Any = None
+     tokenizer: Any = None
+     max_tokens: int = int(os.getenv("MAX_TOKENS", 512))
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         print(f"🚀 Loading MLX Model: {self.model_id}")
+         self.model, self.tokenizer = load(self.model_id)
+
+     @property
+     def _llm_type(self) -> str:
+         return "mlx_llama"
+
+     def _call(
+         self,
+         prompt: str,
+         stop: Optional[List[str]] = None,
+         run_manager: Optional[CallbackManagerForLLMRun] = None,
+         **kwargs: Any,
+     ) -> str:
+         if stop is not None:
+             raise ValueError("stop kwargs are not permitted.")
+
+         messages = [{"role": "user", "content": prompt}]
+         formatted_prompt = self.tokenizer.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+
+         response = generate(
+             self.model,
+             self.tokenizer,
+             prompt=formatted_prompt,
+             verbose=False,
+             max_tokens=self.max_tokens
+         )
+         return response
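
A minimal usage sketch for the wrapper above (illustrative, not part of the commit), assuming an Apple Silicon machine with the "local" dependency group installed and the repository root on sys.path:

    # Hypothetical smoke test for MLXLLM; model_id falls back to the
    # MODEL_ID env var or the Llama-3.2-3B-Instruct-4bit default above.
    from src.generation.mlx_wrapper import MLXLLM

    llm = MLXLLM()
    # LangChain LLMs accept a plain string via .invoke() and return a string.
    print(llm.invoke("Summarize what a RAG pipeline does in one sentence."))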
src/generation/model.py ADDED
@@ -0,0 +1,42 @@
+ # src/generation/model.py
+ import sys
+
+ def load_model(model_path="mlx-community/Llama-3.2-3B-Instruct-4bit"):
+     """
+     Loads the model conditionally based on the environment.
+     Local (Mac): uses MLX for GPU acceleration.
+     Cloud (Linux): uses HuggingFace Transformers (CPU/CUDA).
+     """
+     try:
+         from mlx_lm import load, generate
+         print(f"Loading {model_path} with MLX on Apple Silicon...")
+         model, tokenizer = load(model_path)
+         return model, tokenizer, "mlx"
+     except ImportError:
+         # Fallback for Docker/Cloud if MLX isn't available
+         print("MLX not found. Falling back to Transformers...")
+         from transformers import AutoModelForCausalLM, AutoTokenizer
+         tokenizer = AutoTokenizer.from_pretrained(model_path)
+         model = AutoModelForCausalLM.from_pretrained(model_path)
+         return model, tokenizer, "transformers"
+
+ if __name__ == "__main__":
+     import mlx.core as mx
+
+     # 1. Check default device
+     device = mx.default_device()
+     print(f"✅ Current MLX Device: {device}")  # Should say "gpu"
+
+     # 2. Run inference to trigger the GPU
+     model, tokenizer, backend = load_model()
+
+     if backend == "mlx":
+         from mlx_lm import generate
+         prompt = "Explain quantum physics in one sentence."
+         messages = [{"role": "user", "content": prompt}]
+         prompt_formatted = tokenizer.apply_chat_template(messages, tokenize=False)
+
+         print("\n🧪 Testing Inference (Watch your GPU stats now)...")
+         response = generate(model, tokenizer, prompt=prompt_formatted, verbose=True)
+         print(f"\n🤖 Response: {response}")
+
src/ingestion/pipeline.py ADDED
@@ -0,0 +1,81 @@
+ # src/ingestion/pipeline.py
+ import os
+ import mlflow
+ import chromadb
+ from langchain_community.document_loaders import TextLoader, DirectoryLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+
+ # Configuration
+ DATA_PATH = "data/raw"
+ DB_PATH = "data/chroma_db"
+ COLLECTION_NAME = "rag_experiments"
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+ class IngestionPipeline:
+     def __init__(self):
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name=EMBEDDING_MODEL,
+             model_kwargs={'device': 'mps'}  # Use M4 MPS for embeddings
+         )
+
+     def load_documents(self):
+         """Loads text files from the data directory."""
+         loader = DirectoryLoader(DATA_PATH, glob="*.txt", loader_cls=TextLoader)
+         documents = loader.load()
+         print(f"📄 Loaded {len(documents)} documents.")
+         return documents
+
+     def chunk_documents(self, documents, chunk_size=1000, chunk_overlap=200):
+         """Splits documents into smaller chunks."""
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap
+         )
+         chunks = text_splitter.split_documents(documents)
+         print(f"🧩 Split into {len(chunks)} chunks.")
+         return chunks
+
+     def store_embeddings(self, chunks):
+         """Embeds chunks and stores them in ChromaDB."""
+         if os.path.exists(DB_PATH):
+             print("⚠️ Existing DB found. Appending...")
+
+         vectorstore = Chroma.from_documents(
+             documents=chunks,
+             embedding=self.embeddings,
+             persist_directory=DB_PATH,
+             collection_name=COLLECTION_NAME
+         )
+         print(f"💾 Saved to {DB_PATH}")
+         return vectorstore
+
+     def run(self):
+         """Runs the full pipeline with MLflow tracking."""
+         mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
+
+         with mlflow.start_run(run_name="Ingestion_Phase_2"):
+             # Log parameters
+             mlflow.log_param("embedding_model", EMBEDDING_MODEL)
+             mlflow.log_param("chunk_size", 1000)
+             mlflow.log_param("chunk_overlap", 200)
+
+             # Execute steps
+             docs = self.load_documents()
+             chunks = self.chunk_documents(docs)
+             self.store_embeddings(chunks)
+
+             # Log metrics
+             mlflow.log_metric("num_documents", len(docs))
+             mlflow.log_metric("num_chunks", len(chunks))
+
+             print("✅ Ingestion complete and logged to Dagshub!")
+
+ if __name__ == "__main__":
+     pipeline = IngestionPipeline()
+     pipeline.run()
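
A quick way to sanity-check what the pipeline persisted, sketched under the same paths and collection name as above (illustrative, not part of the commit; device is set to 'cpu' here so it also runs off Apple Silicon):

    # Hypothetical check that data/chroma_db contains the ingested chunks.
    from langchain_huggingface import HuggingFaceEmbeddings
    from langchain_chroma import Chroma

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},  # the pipeline itself uses 'mps'
    )
    store = Chroma(
        persist_directory="data/chroma_db",
        embedding_function=embeddings,
        collection_name="rag_experiments",
    )
    for doc in store.similarity_search("What does the platform use for tracking?", k=2):
        print(doc.page_content[:120])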
src/retrieval/rag_chain.py ADDED
@@ -0,0 +1,75 @@
+ # src/retrieval/rag_chain.py
+ import sys
+ import os
+ from dotenv import load_dotenv
+
+ # Add project root to path
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+ import mlflow
+ from langchain_chroma import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from src.generation.mlx_wrapper import MLXLLM
+
+ load_dotenv()
+
+ # Configuration from ENV
+ DB_PATH = os.getenv("CHROMA_DB_PATH", "data/chroma_db")
+ COLLECTION_NAME = os.getenv("CHROMA_COLLECTION_NAME", "rag_experiments")
+ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+
+ def format_docs(docs):
+     return "\n\n".join(doc.page_content for doc in docs)
+
+ def build_rag_chain():
+     """Builds and returns the RAG chain using LCEL."""
+
+     # 1. Initialize embeddings
+     embeddings = HuggingFaceEmbeddings(
+         model_name=EMBEDDING_MODEL,
+         model_kwargs={'device': 'mps'}
+     )
+
+     # 2. Initialize retriever
+     vectorstore = Chroma(
+         persist_directory=DB_PATH,
+         embedding_function=embeddings,
+         collection_name=COLLECTION_NAME
+     )
+     retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
+
+     # 3. Initialize LLM (no arguments needed; it pulls its config from the env)
+     llm = MLXLLM()
+
+     # 4. Create prompt template
+     template = """Use the following pieces of context to answer the question at the end.
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+ Context: {context}
+
+ Question: {question}
+
+ Answer:"""
+     custom_prompt = PromptTemplate.from_template(template)
+
+     # 5. Build LCEL chain
+     rag_chain = (
+         {"context": retriever | format_docs, "question": RunnablePassthrough()}
+         | custom_prompt
+         | llm
+         | StrOutputParser()
+     )
+
+     return rag_chain
+
+ if __name__ == "__main__":
+     chain = build_rag_chain()
+     query = "What technologies does the RAG platform use?"
+
+     print(f"\n❓ Query: {query}")
+     res = chain.invoke(query)
+     print("\n🤖 Answer:")
+     print(res)
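
rag_chain.py imports mlflow but does not use it yet; one possible way to wire query-level tracking around the chain, sketched under the same env vars as the ingestion pipeline (an assumption-laden example, not something this commit implements):

    # Hypothetical per-query tracking; assumes MLFLOW_TRACKING_URI is set in .env.
    import os
    import time
    import mlflow
    from src.retrieval.rag_chain import build_rag_chain

    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
    chain = build_rag_chain()

    with mlflow.start_run(run_name="rag_query"):
        query = "What technologies does the RAG platform use?"
        start = time.time()
        answer = chain.invoke(query)
        mlflow.log_param("query", query)
        mlflow.log_metric("latency_s", time.time() - start)
        print(answer)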
uv.lock ADDED
The diff for this file is too large to render.