satyaki-mitra committed on
Commit
1ee1cb7
·
1 Parent(s): f7bf809

llama-cpp support and hf space integration

.env.huggingface ADDED
@@ -0,0 +1,128 @@
1
+ # ============================================
2
+ # HUGGINGFACE SPACES CONFIGURATION (FREE TIER)
3
+ # ============================================
4
+
5
+ # Environment Detection
6
+ IS_HUGGINGFACE_SPACE=true
7
+ DEPLOYMENT_ENV=huggingface
8
+
9
+ # ============================================
10
+ # LLM PROVIDER CONFIGURATION
11
+ # ============================================
12
+
13
+ # Provider Priority (explicit for HF Spaces)
14
+ LLM_PROVIDER_PRIORITY=llama_cpp,openai,anthropic,hf_inference
15
+ LLM_DEFAULT_PROVIDER=llama_cpp
16
+
17
+ # Provider Availability
18
+ ENABLE_OLLAMA=false # Ollama not available on HF Spaces
19
+ ENABLE_LLAMA_CPP=true # Primary: llama.cpp with GGUF models
20
+ ENABLE_OPENAI=false # Disabled unless you add API key
21
+ ENABLE_ANTHROPIC=false # Disabled unless you add API key
22
+ ENABLE_HF_INFERENCE=false # Disabled unless you enable below
23
+
24
+ # ============================================
25
+ # LLAMA.CPP CONFIGURATION (PRIMARY PROVIDER)
26
+ # ============================================
27
+
28
+ # Model Selection (Hermes-2-Pro-Llama-3-8B is excellent for legal analysis)
29
+ LLAMA_CPP_MODEL_REPO=NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF
30
+ LLAMA_CPP_MODEL_FILE=Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
31
+
32
+ # CPU-Only Configuration (CRITICAL for free tier)
33
+ LLAMA_CPP_N_GPU_LAYERS=0 # 0 = CPU only
34
+ LLAMA_CPP_N_CTX=4096 # Context window
35
+ LLAMA_CPP_N_BATCH=128 # Smaller batches for CPU memory
36
+ LLAMA_CPP_N_THREADS=4 # CPU threads (optimize for free tier)
37
+
38
+ # ============================================
39
+ # LLM GENERATION SETTINGS
40
+ # ============================================
41
+
42
+ # Generation Parameters
43
+ LLM_TEMPERATURE=0.1 # Low temperature for consistent legal analysis
44
+ LLM_MAX_TOKENS=1024 # Max tokens per response
45
+ LLM_TOP_P=0.95 # Top-p sampling
46
+ LLM_REPEAT_PENALTY=1.1 # Repeat penalty
47
+
48
+ # System Prompt (optimized for legal analysis)
49
+ LLM_SYSTEM_PROMPT="You are a specialized legal contract analyst. Provide concise, accurate analysis focusing on risk identification, clause interpretation, and practical recommendations."
50
+
51
+ # ============================================
52
+ # EXTERNAL API FALLBACKS (OPTIONAL)
53
+ # ============================================
54
+
55
+ # OpenAI API (optional fallback - add your key in Space Secrets)
56
+ # ENABLE_OPENAI=true
57
+ # OPENAI_API_KEY=sk-xxxxxxx
58
+ # OPENAI_MODEL=gpt-3.5-turbo
59
+ # OPENAI_TIMEOUT=30
60
+ # OPENAI_MAX_TOKENS=1024
61
+
62
+ # Anthropic API (optional fallback)
63
+ # ENABLE_ANTHROPIC=true
64
+ # ANTHROPIC_API_KEY=sk-ant-xxxxxxx
65
+ # ANTHROPIC_MODEL=claude-3-haiku-20240307
66
+ # ANTHROPIC_TIMEOUT=30
67
+
68
+ # HuggingFace Inference API (optional - uses HF token from environment)
69
+ # ENABLE_HF_INFERENCE=true
70
+ # HF_MODEL_ID=meta-llama/Llama-2-7b-chat-hf
71
+ # HF_API_TOKEN=${HF_TOKEN} # Automatically provided by HF Spaces
72
+
73
+ # ============================================
74
+ # APPLICATION SETTINGS
75
+ # ============================================
76
+
77
+ # File Upload Limits
78
+ MAX_UPLOAD_SIZE=10485760 # 10MB (free tier memory consideration)
79
+ ALLOWED_EXTENSIONS=.pdf,.docx,.txt
80
+
81
+ # Contract Analysis Limits
82
+ MIN_CONTRACT_LENGTH=300 # Minimum characters
83
+ MAX_CONTRACT_LENGTH=500000 # Maximum characters (500KB)
84
+
85
+ # Performance Settings
86
+ MODEL_CACHE_SIZE=2 # Cache 2 models in memory (free tier limit)
87
+ USE_GPU=false # Force CPU-only for free tier
88
+
89
+ # Logging
90
+ LOG_LEVEL=INFO
91
+ LOG_FILE=/tmp/app.log # Use tmp for ephemeral storage
92
+
93
+ # Cache Settings
94
+ ENABLE_CACHE=true
95
+ CACHE_TTL=3600 # 1 hour cache
96
+ CACHE_DIR=/tmp/cache # Use tmp for ephemeral storage
97
+
98
+ # Model Cache Directory (/data is persistent only when the Space has persistent storage enabled)
99
+ MODEL_CACHE_DIR=/data/models # Ephemeral on the free tier; enable persistent storage to keep models across restarts
100
+
101
+ # Rate Limiting (important for free tier)
102
+ RATE_LIMIT_ENABLED=true
103
+ RATE_LIMIT_REQUESTS=5 # Reduced for free tier
104
+ RATE_LIMIT_PERIOD=60 # Per minute
105
+
106
+ # ============================================
107
+ # SERVER CONFIGURATION
108
+ # ============================================
109
+
110
+ # Server Settings (HF Spaces uses port 7860)
111
+ HOST=0.0.0.0
112
+ PORT=7860 # HF Spaces default port
113
+ WORKERS=1 # Single worker for free tier
114
+ RELOAD=false # Disable reload in production
115
+
116
+ # CORS (configure for your frontend)
117
+ CORS_ORIGINS=["https://*.hf.space", "http://localhost:3000"]
118
+ CORS_ALLOW_CREDENTIALS=true
119
+ CORS_ALLOW_METHODS=["*"]
120
+ CORS_ALLOW_HEADERS=["*"]
121
+
122
+ # ============================================
123
+ # PDF REPORT SETTINGS
124
+ # ============================================
125
+
126
+ PDF_FONT_SIZE=10
127
+ PDF_MARGIN=0.5
128
+ PDF_PAGE_SIZE=letter
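A quick way to sanity-check this file is to load it through pydantic-settings, which is what `config/settings.py` uses; a minimal sketch, assuming the `Settings` class from this commit is importable and the file sits in the project root:

```python
# Sketch: load the HF Spaces env file and inspect the resolved provider setup.
from config.settings import Settings

settings = Settings(_env_file=".env.huggingface")  # pydantic-settings accepts an _env_file override at init

print(settings.DEPLOYMENT_ENV)         # "huggingface" when SPACE_ID is set, otherwise "local"
print(settings.ENABLE_LLAMA_CPP)       # the validators force this to True on Spaces
print(settings.LLM_PROVIDER_PRIORITY)  # e.g. ["llama_cpp"] on Spaces with no API keys configured
```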
Dockerfile CHANGED
@@ -2,30 +2,50 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies
6
  RUN apt-get update && apt-get install -y \
7
  curl \
8
  wget \
 
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Copy requirements and install
12
  COPY requirements.txt .
13
- RUN pip install --no-cache-dir -r requirements.txt
 
14
 
15
  # Download spaCy model
16
  RUN python -m spacy download en_core_web_sm
17
 
18
- # Install Ollama
19
- RUN curl -fsSL https://ollama.ai/install.sh | sh
20
-
21
  # Copy application
22
  COPY . .
23
 
24
- # Create directories
25
- RUN mkdir -p uploads cache logs
26
 
27
- # Expose port
28
  EXPOSE 7860
29
 
30
- # Simple CMD - start Ollama in background, then start FastAPI
31
- CMD ollama serve & sleep 20 && ollama pull llama3:8b & uvicorn app:app --host 0.0.0.0 --port 7860
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies for llama-cpp-python and PDF processing
6
  RUN apt-get update && apt-get install -y \
7
  curl \
8
  wget \
9
+ git \
10
+ build-essential \
11
+ cmake \
12
+ pkg-config \
13
+ libopenblas-dev \
14
+ liblapack-dev \
15
+ libxml2-dev \
16
+ libxslt1-dev \
17
+ zlib1g-dev \
18
+ libjpeg-dev \
19
+ libpng-dev \
20
+ libfreetype6-dev \
21
  && rm -rf /var/lib/apt/lists/*
22
 
23
+ # Copy requirements and install with optimizations
24
  COPY requirements.txt .
25
+ RUN pip install --no-cache-dir --upgrade pip && \
26
+ pip install --no-cache-dir -r requirements.txt
27
 
28
  # Download spaCy model
29
  RUN python -m spacy download en_core_web_sm
30
 
 
 
 
31
  # Copy application
32
  COPY . .
33
 
34
+ # Create directories (HF Spaces uses /data for persistent storage)
35
+ RUN mkdir -p uploads cache logs /data/models
36
 
37
+ # Expose port (HF Spaces uses 7860 by default)
38
  EXPOSE 7860
39
 
40
+ # Environment variables for CPU-only operation
41
+ ENV LLAMA_CPP_N_GPU_LAYERS=0
42
+ ENV CUDA_VISIBLE_DEVICES="" # Disable CUDA for free tier
43
+ ENV OMP_NUM_THREADS=4 # Optimize for CPU
44
+ ENV NUMEXPR_MAX_THREADS=4
45
+
46
+ # HEALTH CHECK for HF Spaces
47
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
48
+ CMD curl -f http://localhost:7860/api/v1/health || exit 1
49
+
50
+ # CMD for HuggingFace Spaces (NO Ollama!)
51
+ CMD uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1 --timeout-keep-alive 30
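The HEALTHCHECK above probes `/api/v1/health`, so the FastAPI app is expected to serve a route at that path (the prefix matches `API_PREFIX="/api/v1/"` in `config/settings.py`). A minimal sketch of such an endpoint; the handler name and response shape are illustrative, not taken from `app.py`:

```python
# Illustrative health endpoint matching the Dockerfile HEALTHCHECK path.
from fastapi import FastAPI

app = FastAPI()

@app.get("/api/v1/health")
def health() -> dict:
    # Keep this cheap: the Docker healthcheck hits it every 30 seconds.
    return {"status": "ok"}
```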
README.md CHANGED
@@ -22,12 +22,21 @@ license: mit
22
  [![Legal-BERT](https://img.shields.io/badge/Legal--BERT-nlpaueb/legal--bert--base--uncased-orange)](https://huggingface.co/nlpaueb/legal-bert-base-uncased)
23
  [![Sentence-BERT](https://img.shields.io/badge/Sentence--BERT-all--MiniLM--L6--v2-lightgrey)](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
24
  [![Ollama](https://img.shields.io/badge/Ollama-llama3:8b-7c3aed)](https://ollama.ai/)
 
25
  [![Docker](https://img.shields.io/badge/Docker-Ready-2496ed)](https://docker.com/)
26
  [![spaCy](https://img.shields.io/badge/spaCy-3.7+-09a3d5)](https://spacy.io/)
27
 
28
  > **Democratizing Legal Intelligence Through AI**
29
  > Comprehensive contract risk analysis using an integrated pipeline with Legal-BERT, multi-model NLP, and LLM interpretation
30
 
 
31
  </div>
32
 
33
  ## 🎯 Overview
@@ -39,8 +48,8 @@ The AI Contract Risk Analyzer is a production-grade legal document analysis plat
39
  - 📄 **Multi-Format Support**: PDF, DOCX, TXT document processing
40
  - 🔍 **9 Contract Categories**: Employment, NDA, Lease, Service agreements, etc.
41
  - ⚡ **Sub-60s Analysis**: Real-time risk scoring and clause extraction via pre-loaded models
42
- - 🔒 **Privacy-First**: Ephemeral processing, zero data retention
43
- - 🌐 **LLM Integration**: Ollama (local), OpenAI, Anthropic support with fallback
44
  - 📊 **Comprehensive Reports**: Executive summaries, negotiation playbooks, market comparisons, and downloadable PDFs
45
  - 🔄 **Integrated Pipeline**: A single orchestrator (`PreloadedAnalysisService`) ensures consistent context propagation from classification through to final reporting
46
 
@@ -108,7 +117,8 @@ This diagram illustrates the core components and their interactions, highlightin
108
  │ └─────────────────────────────────────────────────────┘ │
109
  │ ┌─────────────────────────────────────────────────────┐ │
110
  │ │ LLM Manager (Multi-Provider) │ │
111
- │ │ - Ollama (Local, Free)
 
112
  │ │ - OpenAI (GPT-3.5/4) │ │
113
  │ │ - Anthropic (Claude) │ │
114
  │ │ - Auto-Fallback & Rate Limiting │ │
@@ -243,7 +253,7 @@ graph LR
243
 
244
  ---
245
 
246
- ## 🚀 Installation
247
 
248
  ### Prerequisites
249
 
@@ -255,6 +265,38 @@ Storage: 10GB for models
255
  GPU: Optional (3x speedup with NVIDIA GPU + CUDA 11.8+)
256
  ```
257
 
 
 
 
 
258
  ### Quick Install
259
 
260
  ```bash
@@ -333,6 +375,16 @@ python app.py
333
  uvicorn app:app --reload --host 0.0.0.0 --port 8000
334
  ```
335
 
 
 
336
  ---
337
 
338
  ## 🔧 Technical Details
@@ -346,10 +398,17 @@ Legal-BERT: nlpaueb/legal-bert-base-uncased # 110M params, 768-dim
346
  Sentence-BERT: all-MiniLM-L6-v2 # 22M params, 384-dim
347
 
348
  # LLM Integration
349
- Ollama: llama3:8b (local, free)
 
350
  OpenAI: gpt-3.5-turbo, gpt-4
351
  Anthropic: claude-3-sonnet, claude-3-opus
352
 
 
 
 
 
 
 
353
  # Deep Learning Framework
354
  PyTorch: 2.1+
355
  Transformers: 4.35+ (Hugging Face)
@@ -494,8 +553,43 @@ Sentence-BERT Model: ~100MB
494
  LLM Manager: ~50MB
495
  Total (Idle): ~600MB
496
  Total (Peak): ~1.2GB
 
 
 
 
497
  ```
498
 
 
 
499
  ---
500
 
501
  ## 📝 License
 
22
  [![Legal-BERT](https://img.shields.io/badge/Legal--BERT-nlpaueb/legal--bert--base--uncased-orange)](https://huggingface.co/nlpaueb/legal-bert-base-uncased)
23
  [![Sentence-BERT](https://img.shields.io/badge/Sentence--BERT-all--MiniLM--L6--v2-lightgrey)](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
24
  [![Ollama](https://img.shields.io/badge/Ollama-llama3:8b-7c3aed)](https://ollama.ai/)
25
+ [![Llama.cpp](https://img.shields.io/badge/Llama.cpp-GGUF_Models-4B5563)](https://github.com/ggerganov/llama.cpp)
26
  [![Docker](https://img.shields.io/badge/Docker-Ready-2496ed)](https://docker.com/)
27
  [![spaCy](https://img.shields.io/badge/spaCy-3.7+-09a3d5)](https://spacy.io/)
28
 
29
  > **Democratizing Legal Intelligence Through AI**
30
  > Comprehensive contract risk analysis using an integrated pipeline with Legal-BERT, multi-model NLP, and LLM interpretation
31
 
32
+ > **⚠️ Important Disclaimer**: This tool provides AI-assisted contract analysis and is not a substitute for professional legal advice. Always consult a qualified attorney for legal matters. The AI may produce inaccurate or incomplete analyses.
33
+
34
+
35
+ > **🔐 Data Privacy**: Choose your deployment carefully:
36
+ > - **Local deployment** (Ollama/Llama.cpp) = Maximum privacy
37
+ > - **Cloud deployment** = Files processed on external servers
38
+ > - **API providers** (OpenAI/Anthropic) = Contract text sent to third parties
39
+
40
  </div>
41
 
42
  ## 🎯 Overview
 
48
  - 📄 **Multi-Format Support**: PDF, DOCX, TXT document processing
49
  - 🔍 **9 Contract Categories**: Employment, NDA, Lease, Service agreements, etc.
50
  - ⚡ **Sub-60s Analysis**: Real-time risk scoring and clause extraction via pre-loaded models
51
+ - 🛡️ **Privacy-Flexible**: Choose from 100% local (Ollama), local GGUF models on cloud hardware (llama.cpp), or external APIs
52
+ - 🌐 **Multi-Provider LLM**: Ollama (100% local), llama.cpp (local GGUF models), OpenAI, Anthropic with fallback
53
  - 📊 **Comprehensive Reports**: Executive summaries, negotiation playbooks, market comparisons, and downloadable PDFs
54
  - 🔄 **Integrated Pipeline**: A single orchestrator (`PreloadedAnalysisService`) ensures consistent context propagation from classification through to final reporting
55
 
 
117
  │ └─────────────────────────────────────────────────────┘ │
118
  │ ┌─────────────────────────────────────────────────────┐ │
119
  │ │ LLM Manager (Multi-Provider) │ │
120
+ │ │ - Ollama (Local, Free) │ │
121
+ │ │ - Llama.cpp (GGUF Models, CPU/GPU) │ │
122
  │ │ - OpenAI (GPT-3.5/4) │ │
123
  │ │ - Anthropic (Claude) │ │
124
  │ │ - Auto-Fallback & Rate Limiting │ │
 
253
 
254
  ---
255
 
256
+ ## 🚀 Installation Options
257
 
258
  ### Prerequisites
259
 
 
265
  GPU: Optional (3x speedup with NVIDIA GPU + CUDA 11.8+)
266
  ```
267
 
268
+
269
+ ### Installation Options
270
+
271
+ Choose based on your privacy and hardware requirements:
272
+
273
+ #### 🔒 Option A: Maximum Privacy (Local Ollama)
274
+ ```bash
275
+ # For complete local processing
276
+ pip install -r requirements.txt
277
+ ollama serve
278
+ ollama pull llama3:8b
279
+ ```
280
+ #### 💻 Option B: Good Privacy + CPU Support (Local Llama.cpp)
281
+ ```bash
282
+ # For systems without GPU or Ollama
283
+ pip install llama-cpp-python huggingface-hub
284
+ # Models downloaded automatically on first run
285
+ ```
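For reference, the "downloaded automatically on first run" step in Option B boils down to something like the sketch below. The repo name follows `.env.huggingface`; the quantized file name is an assumption and should match one of the `.gguf` files actually published in that repo, and the loading code in `model_manager/llm_manager.py` may differ in detail.

```python
# Sketch of the Option B download-and-load path (CPU-only, Q4_K_M quantization assumed).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
    filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",  # pick a file name from the repo listing
    local_dir="models",
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,       # matches LLAMA_CPP_N_CTX
    n_gpu_layers=0,   # CPU-only, as in the free-tier config
    n_threads=4,
)

out = llm("Summarize the key risks in a non-compete clause.", max_tokens=256, temperature=0.1)
print(out["choices"][0]["text"])
```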
286
+
287
+ #### ☁️ Option C: Free Cloud (HuggingFace Spaces)
288
+ ```bash
289
+ # No installation needed
290
+ # Visit: https://huggingface.co/spaces/[your-space]
291
+ # Models automatically downloaded, runs on HF infrastructure
292
+ ```
293
+
294
+ #### 🌐 Option D: External APIs (Best Quality)
295
+ ```bash
296
+ # Add API keys to .env for OpenAI/Anthropic
297
+ # Models run on external servers
298
+ ```
299
+
300
  ### Quick Install
301
 
302
  ```bash
 
375
  uvicorn app:app --reload --host 0.0.0.0 --port 8000
376
  ```
377
 
378
+
379
+ ### Deployment Options Summary
380
+
381
+ | Option | Privacy | Setup | Best For |
382
+ |--------|---------|-------|----------|
383
+ | **Local Ollama** | 🔒 Maximum | Medium | Sensitive contracts |
384
+ | **Local Llama.cpp** | 🔒 High | Easy | General use, CPU-only |
385
+ | **HF Spaces** | 🟡 Medium | Trivial | Demos, testing |
386
+ | **External APIs** | 🟡 Medium | Easy | Non-sensitive, best quality |
387
+
388
  ---
389
 
390
  ## 🔧 Technical Details
 
398
  Sentence-BERT: all-MiniLM-L6-v2 # 22M params, 384-dim
399
 
400
  # LLM Integration
401
+ Ollama: llama3:8b (100% local, maximum privacy)
402
+ Llama.cpp: GGUF models (local models on CPU/GPU)
403
  OpenAI: gpt-3.5-turbo, gpt-4
404
  Anthropic: claude-3-sonnet, claude-3-opus
405
 
406
+
407
+ # Privacy Levels:
408
+ 1. Ollama → 100% local, no data leaves
409
+ 2. Llama.cpp → Models run locally on your hardware
410
+ 3. OpenAI/Anthropic → Data sent to external servers
411
+
412
  # Deep Learning Framework
413
  PyTorch: 2.1+
414
  Transformers: 4.35+ (Hugging Face)
 
553
  LLM Manager: ~50MB
554
  Total (Idle): ~600MB
555
  Total (Peak): ~1.2GB
556
+
557
+ ```
558
+
559
+ ---
560
+
561
+ ## 🔒 Privacy & Data Safety
562
+
563
+ ### Data Handling by Deployment Type
564
+
565
+ | Deployment | Privacy Level | Where Models Run | Where Files Go | Best For |
566
+ |------------|---------------|------------------|----------------|----------|
567
+ | **Local Ollama** | 🔒 Maximum | Your machine | Your machine only | Sensitive NDAs, employment |
568
+ | **Local Llama.cpp** | 🔒 High | Your machine | Your machine only | General contracts, CPU-only |
569
+ | **HuggingFace Spaces** | 🟡 Medium | HF servers | Temporary HF storage | Testing, public demos |
570
+ | **External APIs** | 🟡 Medium | OpenAI/Anthropic | Sent to 3rd parties | Non-sensitive contracts |
571
+
572
+ ### Configuration for Different Privacy Needs
573
+
574
+ **For Maximum Privacy (Legal Firms, Sensitive Data):**
575
+ ```env
576
+ ENABLE_OLLAMA=true # 100% local
577
+ ENABLE_LLAMA_CPP=true # Local GGUF models
578
+ ENABLE_OPENAI=false # No external data
579
+ ENABLE_ANTHROPIC=false # No external data
580
  ```
581
 
582
+ **For Public Demos (HuggingFace Spaces):**
583
+
584
+ ```env
585
+ ENABLE_OLLAMA=false # Not available on HF
586
+ ENABLE_LLAMA_CPP=true # Local models on HF servers
587
+ ENABLE_OPENAI=false # Optional if API key added
588
+ ENABLE_ANTHROPIC=false # Optional if API key added
589
+ ```
590
+
591
+ > ⚠️ Important: No deployment option provides attorney-client privilege. Always consult a lawyer for legal advice.
592
+
593
  ---
594
 
595
  ## 📝 License
app.py CHANGED
@@ -298,7 +298,7 @@ class PreloadedAnalysisService:
298
  try:
299
  # Initialize with LLM manager - ensure constructor args match
300
  self.services["negotiation_engine"] = NegotiationEngine(llm_manager = self.llm_manager,
301
- default_provider = LLMProvider.OLLAMA,
302
  )
303
  self.service_status["negotiation_engine"] = "loaded"
304
 
@@ -314,7 +314,9 @@ class PreloadedAnalysisService:
314
  log_info("🔄 Pre-loading Summary Generator...")
315
  try:
316
  # Initialize with LLM manager
317
- self.services["summary_generator"] = SummaryGenerator(llm_manager = self.llm_manager)
 
 
318
  self.service_status["summary_generator"] = "loaded"
319
 
320
  log_info("✅ Summary Generator loaded")
@@ -594,7 +596,7 @@ class PreloadedAnalysisService:
594
  contract_type = contract_type_enum,
595
  overall_risk_score = risk_score.overall_score,
596
  max_clauses = len(clauses),
597
- provider = LLMProvider.OLLAMA,
598
  )
599
  log_info("LLM risk interpretation generated")
600
 
 
298
  try:
299
  # Initialize with LLM manager - ensure constructor args match
300
  self.services["negotiation_engine"] = NegotiationEngine(llm_manager = self.llm_manager,
301
+ default_provider = None,
302
  )
303
  self.service_status["negotiation_engine"] = "loaded"
304
 
 
314
  log_info("🔄 Pre-loading Summary Generator...")
315
  try:
316
  # Initialize with LLM manager
317
+ self.services["summary_generator"] = SummaryGenerator(llm_manager = self.llm_manager,
318
+ default_provider = None,
319
+ )
320
  self.service_status["summary_generator"] = "loaded"
321
 
322
  log_info("✅ Summary Generator loaded")
 
596
  contract_type = contract_type_enum,
597
  overall_risk_score = risk_score.overall_score,
598
  max_clauses = len(clauses),
599
+ provider = None,
600
  )
601
  log_info("LLM risk interpretation generated")
602
 
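These hunks stop pinning the provider to `LLMProvider.OLLAMA` and let the manager resolve it from settings at runtime. A minimal sketch of the intended call path; class and field names are taken from this commit, the prompt string is illustrative:

```python
# Sketch: with provider/default_provider left as None, LLMManager falls back to
# settings.LLM_DEFAULT_PROVIDER ("llama_cpp" on HF Spaces, "ollama" locally).
from model_manager.llm_manager import LLMManager

manager = LLMManager(default_provider=None)
response = manager.complete(prompt="List the termination triggers in this clause.")
print(response.provider, response.success, response.text[:200])
```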
config/settings.py CHANGED
@@ -1,7 +1,10 @@
1
  # DEPENDENCIES
 
2
  from pathlib import Path
3
  from pydantic import Field
 
4
  from typing import Optional
 
5
  from pydantic_settings import BaseSettings
6
 
7
 
@@ -10,66 +13,112 @@ class Settings(BaseSettings):
10
  Application-wide settings: primary configuration source
11
  """
12
  # Application Info
13
- APP_NAME : str = "AI Contract Risk Analyzer"
14
- APP_VERSION : str = "1.0.0"
15
- API_PREFIX : str = "/api/v1/"
16
 
17
  # Server Configuration
18
- HOST : str = "0.0.0.0"
19
- PORT : int = 8000
20
- RELOAD : bool = True
21
- WORKERS : int = 1
22
 
23
  # CORS Settings
24
- CORS_ORIGINS : list = ["http://localhost:3000", "http://localhost:8000", "http://127.0.0.1:8000"]
25
- CORS_ALLOW_CREDENTIALS : bool = True
26
- CORS_ALLOW_METHODS : list = ["*"]
27
- CORS_ALLOW_HEADERS : list = ["*"]
28
 
29
  # File Upload Settings
30
- MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024 # 10 MB
31
- ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
32
- UPLOAD_DIR : Path = Path("uploads")
33
 
34
  # Model Management Settings
35
- MODEL_CACHE_SIZE : int = 3 # Number of models to keep in memory
36
- MODEL_DOWNLOAD_TIMEOUT : int = 1800 # 30 minutes
37
- USE_GPU : bool = True # Automatically detect and use GPU if available
 
 
 
38
 
 
 
 
 
 
 
39
  # External API Settings
40
- OLLAMA_BASE_URL : str = "http://localhost:11434"
41
- OLLAMA_MODEL : str = "llama3:8b"
42
- OLLAMA_TIMEOUT : int = 300
43
- OLLAMA_TEMPERATURE : float = 0.1
 
 
 
 
44
 
45
- # External API Keys
46
- OPENAI_API_KEY : Optional[str] = None
47
- ANTHROPIC_API_KEY : Optional[str] = None
 
 
 
 
 
 
 
48
 
49
  # Analysis Limits
50
- MIN_CONTRACT_LENGTH : int = 300 # Minimum characters for valid contract
51
- MAX_CONTRACT_LENGTH : int = 500000 # Maximum characters (500KB text)
52
- MAX_CLAUSES_TO_ANALYZE : int = 100
53
 
54
  # Logging Settings
55
- LOG_LEVEL : str = "INFO"
56
- LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
57
- LOG_FILE : Optional[Path] = Path("logs/app.log")
58
 
59
  # Cache Settings
60
- ENABLE_CACHE : bool = True
61
- CACHE_TTL : int = 3600 # 1 hour
62
- CACHE_DIR : Path = Path("cache")
63
 
64
- # Rate Limiting Settings
65
- RATE_LIMIT_ENABLED : bool = True
66
- RATE_LIMIT_REQUESTS : int = 10
67
- RATE_LIMIT_PERIOD : int = 60 # seconds
68
 
69
- # PDF Report Settings
70
- PDF_FONT_SIZE : int = 10
71
- PDF_MARGIN : float = 0.5 # inches
72
- PDF_PAGE_SIZE : str = "letter"
73
 
74
 
75
  class Config:
@@ -78,14 +127,196 @@ class Settings(BaseSettings):
78
  case_sensitive = True
79
 
80
 
 
 
 
 
81
  def __init__(self, **kwargs):
82
  super().__init__(**kwargs)
83
- # Ensure directories exist
84
  self.UPLOAD_DIR.mkdir(parents = True, exist_ok = True)
85
- self.CACHE_DIR.mkdir(parents = True, exist_ok = True)
 
86
 
87
  if self.LOG_FILE:
88
  self.LOG_FILE.parent.mkdir(parents = True, exist_ok = True)
 
89
 
90
 
91
  # Global settings instance
 
1
  # DEPENDENCIES
2
+ import os
3
  from pathlib import Path
4
  from pydantic import Field
5
+ from typing import Literal
6
  from typing import Optional
7
+ from pydantic import field_validator
8
  from pydantic_settings import BaseSettings
9
 
10
 
 
13
  Application-wide settings: primary configuration source
14
  """
15
  # Application Info
16
+ APP_NAME : str = "AI Contract Risk Analyzer"
17
+ APP_VERSION : str = "1.0.0"
18
+ API_PREFIX : str = "/api/v1/"
19
 
20
  # Server Configuration
21
+ HOST : str = "0.0.0.0"
22
+ PORT : int = 8000
23
+ RELOAD : bool = True
24
+ WORKERS : int = 1
25
 
26
  # CORS Settings
27
+ CORS_ORIGINS : list = ["http://localhost:3000", "http://localhost:8000", "http://127.0.0.1:8000"]
28
+ CORS_ALLOW_CREDENTIALS : bool = True
29
+ CORS_ALLOW_METHODS : list = ["*"]
30
+ CORS_ALLOW_HEADERS : list = ["*"]
31
 
32
  # File Upload Settings
33
+ MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024 # 10 MB
34
+ ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
35
+ UPLOAD_DIR : Path = Path("uploads")
36
 
37
  # Model Management Settings
38
+ MODEL_CACHE_SIZE : int = 3 # Number of models to keep in memory
39
+ MODEL_DOWNLOAD_TIMEOUT : int = 1800 # 30 minutes
40
+ USE_GPU : bool = True # Automatically detect and use GPU if available
41
+
42
+ # Environment Detection Settings
43
+ IS_HUGGINGFACE_SPACE : bool = False # Auto-detected
44
+ IS_LOCAL : bool = True # Auto-detected
45
+ DEPLOYMENT_ENV : Literal["local", "huggingface", "docker", "kubernetes", "aws", "cloud"] = "local" # Covers every value detect_environment can return
46
+
47
+ # LLAMA.CPP Settings (For HF Spaces)
48
+ LLAMA_CPP_ENABLED : bool = False # Auto-enabled in HF Spaces
49
+ LLAMA_CPP_MODEL_PATH : Optional[Path] = None # Local path to GGUF model
50
+ LLAMA_CPP_MODEL_REPO : str = "NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF"
51
+ LLAMA_CPP_MODEL_FILE : str = "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf"
52
+ LLAMA_CPP_N_CTX : int = 4096 # Context window
53
+ LLAMA_CPP_N_GPU_LAYERS : int = -1 # -1 = all layers on GPU
54
+ LLAMA_CPP_N_BATCH : int = 512 # Batch size for prompt processing
55
+ LLAMA_CPP_N_THREADS : int = 4 # CPU threads (0 = auto)
56
 
57
+ # Ollama Settings (For Local)
58
+ OLLAMA_BASE_URL : str = "http://localhost:11434"
59
+ OLLAMA_MODEL : str = "llama3:8b"
60
+ OLLAMA_TIMEOUT : int = 300
61
+ OLLAMA_TEMPERATURE : float = 0.1
62
+
63
  # External API Settings
64
+ OPENAI_API_KEY : Optional[str] = None
65
+ OPENAI_MODEL : str = "gpt-3.5-turbo"
66
+ OPENAI_TIMEOUT : int = 30
67
+ OPENAI_TEMPERATURE : float = 0.1
68
+ OPENAI_MAX_TOKENS : int = 1024
69
+
70
+ ANTHROPIC_API_KEY : Optional[str] = None
71
+ ANTHROPIC_MODEL : str = "claude-3-haiku-20240307"
72
+ ANTHROPIC_TIMEOUT : int = 30
73
+ ANTHROPIC_TEMPERATURE : float = 0.1
74
+ ANTHROPIC_MAX_TOKENS : int = 1024
75
+
76
+ # Priority order for LLM providers
77
+ LLM_PROVIDER_PRIORITY : list = ["ollama", "openai", "anthropic", "llama_cpp"]
78
+
79
+ # Which providers are available
80
+ ENABLE_OLLAMA : bool = True
81
+ ENABLE_LLAMA_CPP : bool = False # Auto-enabled in HF Spaces
82
+ ENABLE_OPENAI : bool = False
83
+ ENABLE_ANTHROPIC : bool = False
84
+ ENABLE_HF_INFERENCE : bool = False # HuggingFace Inference API
85
+
86
+ # Default provider (auto-selected based on environment)
87
+ LLM_DEFAULT_PROVIDER : str = "ollama"
88
 
89
+ # Huggingface Inference Settings (Optional)
90
+ HF_MODEL_ID : Optional[str] = None # e.g. "meta-llama/Llama-2-7b-chat-hf"
91
+ HF_API_TOKEN : Optional[str] = None # HF token for gated models
92
+
93
+ # LLM Generation Settings (Shared across providers)
94
+ LLM_TEMPERATURE : float = 0.1 # Default for all providers
95
+ LLM_MAX_TOKENS : int = 1024 # Default for all providers
96
+ LLM_TOP_P : float = 0.95 # Default top-p sampling
97
+ LLM_REPEAT_PENALTY : float = 1.1 # Default repeat penalty
98
+ LLM_SYSTEM_PROMPT : str = "You are a helpful legal assistant specializing in contract analysis and risk assessment."
99
 
100
  # Analysis Limits
101
+ MIN_CONTRACT_LENGTH : int = 300 # Minimum characters for valid contract
102
+ MAX_CONTRACT_LENGTH : int = 500000 # Maximum characters (500KB text)
103
+ MAX_CLAUSES_TO_ANALYZE : int = 100
104
 
105
  # Logging Settings
106
+ LOG_LEVEL : str = "INFO"
107
+ LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
108
+ LOG_FILE : Optional[Path] = Path("logs/app.log")
109
 
110
  # Cache Settings
111
+ ENABLE_CACHE : bool = True
112
+ CACHE_TTL : int = 3600 # 1 hour
113
+ CACHE_DIR : Path = Path("cache")
114
 
115
+ # Model Cache Directory (for llama.cpp models)
116
+ MODEL_CACHE_DIR : Path = Path("data/models")
 
 
117
 
118
+ # Rate Limiting Settings
119
+ RATE_LIMIT_ENABLED : bool = False
120
+ RATE_LIMIT_REQUESTS : int = 10
121
+ RATE_LIMIT_PERIOD : int = 60 # seconds
122
 
123
 
124
  class Config:
 
127
  case_sensitive = True
128
 
129
 
130
+ @field_validator('IS_HUGGINGFACE_SPACE', 'IS_LOCAL', 'DEPLOYMENT_ENV', mode = 'before')
131
+ def detect_environment(cls, v, info):
132
+ """
133
+ Auto-detect deployment environment
134
+ """
135
+ field_name = info.field_name
136
+
137
+ if (field_name == 'IS_HUGGINGFACE_SPACE'):
138
+ return bool(os.getenv('SPACE_ID'))
139
+
140
+ elif (field_name == 'IS_LOCAL'):
141
+ # Check if not in any container/cloud environment
142
+ return not any([os.getenv('SPACE_ID'),
143
+ os.getenv('DOCKER_CONTAINER'),
144
+ os.getenv('KUBERNETES_SERVICE_HOST'),
145
+ os.getenv('AWS_EXECUTION_ENV')
146
+ ])
147
+
148
+ elif (field_name == 'DEPLOYMENT_ENV'):
149
+ if os.getenv('SPACE_ID'):
150
+ return "huggingface"
151
+
152
+ elif os.getenv('DOCKER_CONTAINER'):
153
+ return "docker"
154
+
155
+ elif os.getenv('KUBERNETES_SERVICE_HOST'):
156
+ return "kubernetes"
157
+
158
+ elif os.getenv('AWS_EXECUTION_ENV'):
159
+ return "aws"
160
+
161
+ else:
162
+ return "local"
163
+
164
+ return v
165
+
166
+
167
+ @field_validator('ENABLE_LLAMA_CPP', 'LLAMA_CPP_ENABLED', mode = 'after')
168
+ def enable_llama_cpp_for_hf(cls, v, info):
169
+ """
170
+ Auto-enable llama.cpp for HuggingFace Spaces
171
+ """
172
+ values = info.data
173
+
174
+ if values.get('IS_HUGGINGFACE_SPACE'):
175
+ return True
176
+
177
+ return v
178
+
179
+
180
+ @field_validator('ENABLE_OLLAMA', mode = 'after')
181
+ def disable_ollama_for_hf(cls, v, info):
182
+ """
183
+ Auto-disable Ollama for HuggingFace Spaces
184
+ """
185
+ values = info.data
186
+
187
+ if values.get('IS_HUGGINGFACE_SPACE'):
188
+ return False
189
+
190
+ return v
191
+
192
+
193
+ @field_validator('LLM_PROVIDER_PRIORITY', mode='after')
194
+ def adjust_provider_priority(cls, v, info):
195
+ """
196
+ Adjust provider priority based on environment
197
+ """
198
+ values = info.data
199
+
200
+ if values.get('IS_HUGGINGFACE_SPACE'):
201
+ # For HF Spaces: llama_cpp first, then external APIs
202
+ priority = []
203
+
204
+ if (values.get('ENABLE_LLAMA_CPP')):
205
+ priority.append("llama_cpp")
206
+
207
+ if (values.get('ENABLE_HF_INFERENCE') and values.get('HF_API_TOKEN')):
208
+ priority.append("hf_inference")
209
+
210
+ if (values.get('ENABLE_OPENAI') and values.get('OPENAI_API_KEY')):
211
+ priority.append("openai")
212
+
213
+ if (values.get('ENABLE_ANTHROPIC') and values.get('ANTHROPIC_API_KEY')):
214
+ priority.append("anthropic")
215
+
216
+ return priority if priority else ["llama_cpp"]
217
+
218
+ else:
219
+ # For local: Ollama first
220
+ priority = list()
221
+
222
+ if values.get('ENABLE_OLLAMA'):
223
+ priority.append("ollama")
224
+
225
+ if values.get('ENABLE_LLAMA_CPP'):
226
+ priority.append("llama_cpp")
227
+
228
+ if values.get('ENABLE_OPENAI') and values.get('OPENAI_API_KEY'):
229
+ priority.append("openai")
230
+
231
+ if values.get('ENABLE_ANTHROPIC') and values.get('ANTHROPIC_API_KEY'):
232
+ priority.append("anthropic")
233
+
234
+ return priority if priority else ["ollama"]
235
+
236
+
237
+ @field_validator('LLM_DEFAULT_PROVIDER', mode='after')
238
+ def set_default_provider(cls, v, info):
239
+ """
240
+ Set default provider based on availability
241
+ """
242
+ values = info.data
243
+
244
+ # Get the priority list (after adjustments)
245
+ priority = values.get('LLM_PROVIDER_PRIORITY', [])
246
+
247
+ if priority:
248
+ # First available provider is default
249
+ return priority[0]
250
+
251
+ # Fallback
252
+ return "ollama"
253
+
254
+
255
+ @field_validator('MODEL_CACHE_DIR')
256
+ def set_model_cache_dir(cls, v, info):
257
+ """
258
+ Set appropriate model cache directory based on environment
259
+ """
260
+ values = info.data
261
+
262
+ if (values.get('IS_HUGGINGFACE_SPACE')):
263
+ # HF Spaces expose /data (persistent only when persistent storage is enabled)
264
+ return Path("/data/models")
265
+
266
+ elif (values.get('DEPLOYMENT_ENV') == "docker"):
267
+ # Docker containers
268
+ return Path("/app/models")
269
+
270
+ else:
271
+ # Local development
272
+ return Path("models")
273
+
274
+
275
+ @field_validator('LLAMA_CPP_N_GPU_LAYERS')
276
+ def optimize_gpu_layers(cls, v, info):
277
+ """
278
+ Auto-optimize GPU layers for different environments
279
+ """
280
+ values = info.data
281
+
282
+ if values.get('IS_HUGGINGFACE_SPACE'):
283
+ # HF Spaces: T4 GPU with 15-16GB VRAM
284
+ # For 8B Q4 model: ~20 layers is safe
285
+ return 20
286
+
287
+ elif v == -1: # -1 means "use all layers"
288
+ # For local with sufficient GPU
289
+ return -1
290
+
291
+ else:
292
+ # Explicit value from config
293
+ return v
294
+
295
+
296
+ @field_validator('LLAMA_CPP_MODEL_PATH')
297
+ def set_default_model_path(cls, v, info):
298
+ """
299
+ Set default model path if not specified
300
+ """
301
+ values = info.data
302
+
303
+ if v is None and values.get('LLAMA_CPP_MODEL_FILE'):
304
+ cache_dir = values.get('MODEL_CACHE_DIR', Path("models"))
305
+ return cache_dir / values['LLAMA_CPP_MODEL_FILE']
306
+
307
+ return v
308
+
309
+
310
  def __init__(self, **kwargs):
311
  super().__init__(**kwargs)
312
+ # Ensure Directories Exist
313
  self.UPLOAD_DIR.mkdir(parents = True, exist_ok = True)
314
+ self.CACHE_DIR.mkdir(parents=True, exist_ok = True)
315
+ self.MODEL_CACHE_DIR.mkdir(parents = True, exist_ok = True)
316
 
317
  if self.LOG_FILE:
318
  self.LOG_FILE.parent.mkdir(parents = True, exist_ok = True)
319
+
320
 
321
 
322
  # Global settings instance
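The validators above key everything off the `SPACE_ID` variable that Hugging Face injects into Space containers. A small sketch of how the auto-detection behaves; the `SPACE_ID` value is a placeholder, and note that `__init__` also creates `MODEL_CACHE_DIR`, which needs write access to `/data` in this scenario:

```python
# Sketch: exercise the environment auto-detection added in this commit.
import os

os.environ["SPACE_ID"] = "user/contract-risk-analyzer"  # placeholder; set by HF Spaces at runtime

from config.settings import Settings

s = Settings()
print(s.IS_HUGGINGFACE_SPACE)  # True, because SPACE_ID is present
print(s.DEPLOYMENT_ENV)        # "huggingface"
print(s.ENABLE_OLLAMA)         # False (disabled on Spaces)
print(s.ENABLE_LLAMA_CPP)      # True (force-enabled on Spaces)
print(s.MODEL_CACHE_DIR)       # PosixPath('/data/models')
```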
model_manager/llm_manager.py CHANGED
@@ -3,6 +3,7 @@ import sys
3
  import json
4
  import time
5
  import requests
 
6
  from enum import Enum
7
  from typing import Any
8
  from typing import Dict
@@ -37,14 +38,24 @@ try:
37
  except ImportError:
38
  ANTHROPIC_AVAILABLE = False
39
 
 
 
 
 
 
 
40
 
 
 
41
  class LLMProvider(Enum):
42
  """
43
  Supported LLM providers
44
  """
45
- OLLAMA = "ollama"
46
- OPENAI = "openai"
47
- ANTHROPIC = "anthropic"
 
 
48
 
49
 
50
  @dataclass
@@ -78,16 +89,16 @@ class LLMResponse:
78
 
79
  class LLMManager:
80
  """
81
- Unified LLM manager for multiple providers : handles Ollama (local), OpenAI API, and Anthropic API
82
  """
83
- def __init__(self, default_provider: LLMProvider = LLMProvider.OLLAMA, ollama_base_url: Optional[str] = None,
84
  openai_api_key: Optional[str] = None, anthropic_api_key: Optional[str] = None):
85
  """
86
  Initialize LLM Manager
87
 
88
  Arguments:
89
  ----------
90
- default_provider : Default LLM provider to use
91
 
92
  ollama_base_url : Ollama server URL (default: from settings)
93
 
@@ -95,7 +106,7 @@ class LLMManager:
95
 
96
  anthropic_api_key : Anthropic API key (or set ANTHROPIC_API_KEY env var)
97
  """
98
- self.default_provider = default_provider
99
  self.logger = ContractAnalyzerLogger.get_logger()
100
 
101
  # Configuration Variables Initialization
@@ -122,24 +133,44 @@ class LLMManager:
122
  else:
123
  self.anthropic_client = None
124
 
 
 
 
 
125
  # Rate limiting (simple token bucket)
126
  self._rate_limit_tokens = settings.RATE_LIMIT_REQUESTS
127
  self._rate_limit_last_refill = time.time()
128
  self._rate_limit_refill_rate = settings.RATE_LIMIT_REQUESTS / settings.RATE_LIMIT_PERIOD
129
 
130
- # Generation settings
131
- self.generation_config = self.config.LLM_GENERATION
 
 
 
 
132
 
133
  log_info("LLMManager initialized",
134
- default_provider = default_provider.value,
135
- ollama_base_url = self.ollama_base_url,
136
- ollama_model = self.ollama_model,
137
- ollama_timeout = self.ollama_timeout,
138
- ollama_temperature = self.ollama_temperature,
139
- openai_available = OPENAI_AVAILABLE and bool(self.openai_api_key),
140
- anthropic_available = ANTHROPIC_AVAILABLE and bool(self.anthropic_api_key),
141
- rate_limit_requests = settings.RATE_LIMIT_REQUESTS,
142
- rate_limit_period = settings.RATE_LIMIT_PERIOD,
143
  )
144
 
145
 
@@ -148,6 +179,9 @@ class LLMManager:
148
  """
149
  Check if Ollama server is available
150
  """
 
 
 
151
  try:
152
  response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
153
  available = (response.status_code == 200)
@@ -165,20 +199,35 @@ class LLMManager:
165
 
166
  def get_available_providers(self) -> List[LLMProvider]:
167
  """
168
- Get list of available providers
169
  """
170
  available = list()
171
 
172
- if self._check_ollama_available():
 
173
  available.append(LLMProvider.OLLAMA)
174
 
175
- if OPENAI_AVAILABLE and self.openai_api_key:
176
  available.append(LLMProvider.OPENAI)
177
 
178
- if ANTHROPIC_AVAILABLE and self.anthropic_api_key:
179
  available.append(LLMProvider.ANTHROPIC)
180
 
181
- log_info("Available LLM providers", providers = [p.value for p in available])
 
 
 
 
182
 
183
  return available
184
 
@@ -188,8 +237,11 @@ class LLMManager:
188
  """
189
  Check if rate limit allows request (simple token bucket)
190
  """
191
- now = time.time()
192
- time_passed = now - self._rate_limit_last_refill
 
 
 
193
 
194
  # Refill tokens
195
  self._rate_limit_tokens = min(settings.RATE_LIMIT_REQUESTS, self._rate_limit_tokens + time_passed * self._rate_limit_refill_rate)
@@ -197,7 +249,6 @@ class LLMManager:
197
 
198
  if (self._rate_limit_tokens >= 1):
199
  self._rate_limit_tokens -= 1
200
-
201
  return True
202
 
203
  log_info("Rate limit hit, waiting...", tokens_remaining = self._rate_limit_tokens)
@@ -216,10 +267,10 @@ class LLMManager:
216
  # UNIFIED COMPLETION METHOD
217
  @ContractAnalyzerLogger.log_execution_time("llm_complete")
218
  def complete(self, prompt: str, provider: Optional[LLMProvider] = None, model: Optional[str] = None, temperature: Optional[float] = None,
219
- max_tokens: Optional[int] = None, system_prompt: Optional[str] = None, json_mode: bool = False, retry_on_error: bool = True,
220
- fallback_providers: Optional[List[LLMProvider]] = None) -> LLMResponse:
221
  """
222
- Unified completion method for all providers
223
 
224
  Arguments:
225
  ----------
@@ -229,9 +280,9 @@ class LLMManager:
229
 
230
  model : Model name (provider-specific)
231
 
232
- temperature : Sampling temperature (0.0-1.0, default from settings/config)
233
 
234
- max_tokens : Maximum tokens to generate (default from config)
235
 
236
  system_prompt : System prompt (if supported)
237
 
@@ -239,15 +290,16 @@ class LLMManager:
239
 
240
  retry_on_error : Retry with fallback providers on error
241
 
242
- fallback_providers : List of fallback providers to try
243
 
244
  Returns:
245
  --------
246
  { LLMResponse } : LLMResponse object
247
  """
248
- provider = provider or self.default_provider
249
- temperature = temperature or self.ollama_temperature
250
- max_tokens = max_tokens or self.generation_config["max_tokens"]
 
251
 
252
  log_info("LLM completion request",
253
  provider = provider.value,
@@ -260,74 +312,116 @@ class LLMManager:
260
  # Rate limiting
261
  self._wait_for_rate_limit()
262
 
263
- # Try primary provider
264
- try:
265
- if (provider == LLMProvider.OLLAMA):
266
- return self._complete_ollama(prompt = prompt,
267
- model = model,
268
- temperature = temperature,
269
- max_tokens = max_tokens,
270
- system_prompt = system_prompt,
271
- json_mode = json_mode,
272
- )
273
-
274
- elif (provider == LLMProvider.OPENAI):
275
- return self._complete_openai(prompt = prompt,
276
- model = model,
277
- temperature = temperature,
278
- max_tokens = max_tokens,
279
- system_prompt = system_prompt,
280
- json_mode = json_mode,
281
- )
282
-
283
- elif (provider == LLMProvider.ANTHROPIC):
284
- return self._complete_anthropic(prompt = prompt,
285
- model = model,
286
- temperature = temperature,
287
- max_tokens = max_tokens,
288
- system_prompt = system_prompt,
289
- )
290
-
291
- else:
292
- raise ValueError(f"Unsupported provider: {provider}")
293
-
294
- except Exception as e:
295
- log_error(e, context = {"component" : "LLMManager", "operation" : "complete", "provider" : provider.value})
 
 
 
 
296
 
297
- # Try fallback providers
298
- if (retry_on_error and fallback_providers):
299
- log_info("Trying fallback providers", fallbacks = [p.value for p in fallback_providers])
 
 
 
 
300
 
301
- for fallback_provider in fallback_providers:
302
- if (fallback_provider == provider):
303
- continue
 
 
 
 
 
 
 
 
304
 
305
- try:
306
- log_info(f"Attempting fallback to {fallback_provider.value}")
307
- # Prevent infinite recursion
308
- return self.complete(prompt = prompt,
309
- provider = fallback_provider,
310
- model = model,
311
- temperature = temperature,
312
- max_tokens = max_tokens,
313
- system_prompt = system_prompt,
314
- json_mode = json_mode,
315
- retry_on_error = False,
316
- )
317
-
318
- except Exception as fallback_error:
319
- log_error(fallback_error, context = {"component" : "LLMManager", "operation" : "fallback_complete", "provider" : fallback_provider.value})
320
- continue
321
-
322
- # All attempts failed
323
- return LLMResponse(text = "",
324
- provider = provider.value,
325
- model = model or "unknown",
326
- tokens_used = 0,
327
- latency_seconds = 0.0,
328
- success = False,
329
- error_message = str(e),
330
- )
 
 
 
 
331
 
332
 
333
  # OLLAMA Provider
@@ -335,6 +429,9 @@ class LLMManager:
335
  """
336
  Complete using local Ollama
337
  """
 
 
 
338
  start_time = time.time()
339
  model = model or self.ollama_model
340
 
@@ -359,7 +456,11 @@ class LLMManager:
359
  json_mode = json_mode,
360
  )
361
 
362
- response = requests.post(f"{self.ollama_base_url}/api/generate", json = payload, timeout = self.ollama_timeout)
 
 
 
 
363
  response.raise_for_status()
364
 
365
  result = response.json()
@@ -391,11 +492,14 @@ class LLMManager:
391
  """
392
  Complete using OpenAI API
393
  """
 
 
 
394
  if not OPENAI_AVAILABLE or not self.openai_api_key:
395
  raise ValueError("OpenAI not available. Install with: pip install openai")
396
 
397
  start_time = time.time()
398
- model = model or "gpt-3.5-turbo"
399
 
400
  # Construct messages
401
  messages = list()
@@ -443,11 +547,14 @@ class LLMManager:
443
  """
444
  Complete using Anthropic (Claude) API
445
  """
 
 
 
446
  if not ANTHROPIC_AVAILABLE or not self.anthropic_client:
447
  raise ValueError("Anthropic not available. Install with: pip install anthropic")
448
 
449
- start_time = time.time()
450
- model = model or "claude-3-sonnet-20240229"
451
 
452
  log_info("Calling Anthropic API", model = model)
453
 
@@ -455,7 +562,7 @@ class LLMManager:
455
  message = self.anthropic_client.messages.create(model = model,
456
  max_tokens = max_tokens,
457
  temperature = temperature,
458
- system = system_prompt or "",
459
  messages = [{"role": "user", "content": prompt}],
460
  )
461
 
@@ -475,6 +582,186 @@ class LLMManager:
475
  )
476
 
477
 
 
 
 
 
478
  # Specialized Methods
479
  def generate_structured_json(self, prompt: str, schema_description: str, provider: Optional[LLMProvider] = None, **kwargs) -> Dict[str, Any]:
480
  """
@@ -526,98 +813,6 @@ class LLMManager:
526
  raise ValueError(f"Failed to parse JSON response: {e}")
527
 
528
 
529
- def batch_complete(self, prompts: List[str], provider: Optional[LLMProvider] = None, **kwargs) -> List[LLMResponse]:
530
- """
531
- Complete multiple prompts (sequential for now)
532
-
533
- Arguments:
534
- ----------
535
- prompts : List of prompts
536
-
537
- provider : LLM provider
538
-
539
- **kwargs : Additional arguments for complete()
540
-
541
- Returns:
542
- --------
543
- { list } : List of LLMResponse objects
544
- """
545
- log_info("Batch completion started", batch_size=len(prompts))
546
-
547
- responses = list()
548
-
549
- for i, prompt in enumerate(prompts):
550
- log_info(f"Processing prompt {i+1}/{len(prompts)}")
551
-
552
- response = self.complete(prompt = prompt,
553
- provider = provider,
554
- **kwargs,
555
- )
556
-
557
- responses.append(response)
558
-
559
- successful = sum(1 for r in responses if r.success)
560
-
561
- log_info("Batch completion finished",
562
- total = len(prompts),
563
- successful = successful,
564
- failed = len(prompts) - successful,
565
- )
566
-
567
- return responses
568
-
569
-
570
- # OLLAMA-Specific Methods
571
- def list_ollama_models(self) -> List[str]:
572
- """
573
- List available local Ollama models
574
- """
575
- try:
576
- response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
577
- response.raise_for_status()
578
-
579
- models = [model['name'] for model in response.json().get('models', [])]
580
-
581
- log_info("Ollama models listed", count = len(models), models = models)
582
-
583
- return models
584
-
585
- except Exception as e:
586
- log_error(e, context = {"component" : "LLMManager", "operation" : "list_ollama_models"})
587
- return []
588
-
589
-
590
- def pull_ollama_model(self, model_name: str) -> bool:
591
- """
592
- Pull/download an Ollama model
593
- """
594
- try:
595
- log_info(f"Pulling Ollama model: {model_name}")
596
-
597
- response = requests.post(f"{self.ollama_base_url}/api/pull",
598
- json = {"name": model_name},
599
- stream = True,
600
- timeout = 600, # 10 minutes for download
601
- )
602
-
603
- response.raise_for_status()
604
-
605
- # Stream response to track progress
606
- for line in response.iter_lines():
607
- if line:
608
- data = json.loads(line)
609
-
610
- if ('status' in data):
611
- log_info(f"Pull status: {data['status']}")
612
-
613
- log_info(f"Model pulled successfully: {model_name}")
614
- return True
615
-
616
- except Exception as e:
617
- log_error(e, context = {"component" : "LLMManager", "operation" : "pull_ollama_model", "model" : model_name})
618
- return False
619
-
620
-
621
  # Utility Methods
622
  def get_provider_info(self, provider: LLMProvider) -> Dict[str, Any]:
623
  """
@@ -629,29 +824,32 @@ class LLMManager:
629
  }
630
 
631
  if (provider == LLMProvider.OLLAMA):
632
- info["available"] = self._check_ollama_available()
633
 
634
  if info["available"]:
635
  info["models"] = self.list_ollama_models()
636
  info["base_url"] = self.ollama_base_url
637
 
638
  elif (provider == LLMProvider.OPENAI):
639
- info["available"] = OPENAI_AVAILABLE and bool(self.openai_api_key)
640
 
641
  if info["available"]:
642
- info["models"] = ["gpt-3.5-turbo",
643
- "gpt-4",
644
- "gpt-4-turbo-preview",
645
- ]
646
 
647
  elif (provider == LLMProvider.ANTHROPIC):
648
- info["available"] = ANTHROPIC_AVAILABLE and bool(self.anthropic_client)
649
 
650
  if info["available"]:
651
- info["models"] = ["claude-3-opus-20240229",
652
- "claude-3-sonnet-20240229",
653
- "claude-3-haiku-20240307",
654
- ]
 
 
 
 
 
 
655
 
656
  return info
657
 
@@ -674,26 +872,26 @@ class LLMManager:
674
  --------
675
  { float } : Estimated cost in USD
676
  """
677
- # Pricing per 1K tokens (as of 2025)
678
- pricing = {"openai" : {"gpt-3.5-turbo" : {"prompt": 0.0015, "completion": 0.002},
679
- "gpt-4" : {"prompt": 0.03, "completion": 0.06},
680
- "gpt-4-turbo-preview" : {"prompt": 0.01, "completion": 0.03},
681
- },
682
- "anthropic" : {"claude-3-opus-20240229" : {"prompt": 0.015, "completion": 0.075},
683
- "claude-3-sonnet-20240229" : {"prompt": 0.003, "completion": 0.015},
684
- "claude-3-haiku-20240307" : {"prompt": 0.00025, "completion": 0.00125},
685
- }
686
- }
687
 
688
- if (provider == LLMProvider.OLLAMA):
689
- # Local models are free
690
- return 0.0
 
 
 
 
 
 
 
691
 
692
  provider_pricing = pricing.get(provider.value, {}).get(model)
693
 
694
  if not provider_pricing:
695
  return 0.0
696
 
697
- cost = ((prompt_tokens / 1000) * provider_pricing["prompt"] + (completion_tokens / 1000) * provider_pricing["completion"])
698
 
699
- return round(cost, 6)
 
3
  import json
4
  import time
5
  import requests
6
+ import threading
7
  from enum import Enum
8
  from typing import Any
9
  from typing import Dict
 
38
  except ImportError:
39
  ANTHROPIC_AVAILABLE = False
40
 
41
+ try:
42
+ from llama_cpp import Llama
43
+ LLAMA_CPP_AVAILABLE = True
44
+
45
+ except ImportError:
46
+ LLAMA_CPP_AVAILABLE = False
47
 
48
+
49
+ # Enums and models
50
  class LLMProvider(Enum):
51
  """
52
  Supported LLM providers
53
  """
54
+ OLLAMA = "ollama"
55
+ OPENAI = "openai"
56
+ ANTHROPIC = "anthropic"
57
+ LLAMA_CPP = "llama_cpp"
58
+ HF_INFER = "hf_inference"
59
 
60
 
61
  @dataclass
 
89
 
90
  class LLMManager:
91
  """
92
+ Unified LLM manager for multiple providers : handles Ollama (local), OpenAI API, Anthropic API, and Llama.cpp
93
  """
94
+ def __init__(self, default_provider: Optional[LLMProvider] = None, ollama_base_url: Optional[str] = None,
95
  openai_api_key: Optional[str] = None, anthropic_api_key: Optional[str] = None):
96
  """
97
  Initialize LLM Manager
98
 
99
  Arguments:
100
  ----------
101
+ default_provider : Default LLM provider to use (if None, uses settings.LLM_DEFAULT_PROVIDER)
102
 
103
  ollama_base_url : Ollama server URL (default: from settings)
104
 
 
106
 
107
  anthropic_api_key : Anthropic API key (or set ANTHROPIC_API_KEY env var)
108
  """
109
+ self.default_provider = default_provider or LLMProvider(settings.LLM_DEFAULT_PROVIDER)
110
  self.logger = ContractAnalyzerLogger.get_logger()
111
 
112
  # Configuration Variables Initialization
 
133
  else:
134
  self.anthropic_client = None
135
 
136
+ # Llama.cpp configuration (lazy loaded)
137
+ self.llama_cpp_model = None
138
+ self.llama_cpp_lock = threading.Lock()
139
+
140
+ # HuggingFace Inference configuration
141
+ self.hf_client = None
142
+
143
+ if (settings.ENABLE_HF_INFERENCE and settings.HF_API_TOKEN):
144
+ try:
145
+ from huggingface_hub import InferenceClient
146
+
147
+ self.hf_client = InferenceClient(model = settings.HF_MODEL_ID,
148
+ token = settings.HF_API_TOKEN,
149
+ )
150
+ except ImportError:
151
+ log_error("huggingface_hub not installed, HF Inference disabled")
152
+
153
  # Rate limiting (simple token bucket)
154
  self._rate_limit_tokens = settings.RATE_LIMIT_REQUESTS
155
  self._rate_limit_last_refill = time.time()
156
  self._rate_limit_refill_rate = settings.RATE_LIMIT_REQUESTS / settings.RATE_LIMIT_PERIOD
157
 
158
+ # Generation settings from settings (not ModelConfig)
159
+ self.generation_config = {"max_tokens" : settings.LLM_MAX_TOKENS,
160
+ "temperature" : settings.LLM_TEMPERATURE,
161
+ "top_p" : settings.LLM_TOP_P,
162
+ "repeat_penalty" : settings.LLM_REPEAT_PENALTY,
163
+ }
164
 
165
  log_info("LLMManager initialized",
166
+ default_provider = self.default_provider.value,
167
+ deployment_env = settings.DEPLOYMENT_ENV,
168
+ ollama_enabled = settings.ENABLE_OLLAMA,
169
+ llama_cpp_enabled = settings.ENABLE_LLAMA_CPP,
170
+ openai_available = OPENAI_AVAILABLE and bool(self.openai_api_key),
171
+ anthropic_available = ANTHROPIC_AVAILABLE and bool(self.anthropic_api_key),
172
+ llama_cpp_available = LLAMA_CPP_AVAILABLE,
173
+ provider_priority = settings.LLM_PROVIDER_PRIORITY,
 
174
  )
175
 
176
 
 
179
  """
180
  Check if Ollama server is available
181
  """
182
+ if not settings.ENABLE_OLLAMA:
183
+ return False
184
+
185
  try:
186
  response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
187
  available = (response.status_code == 200)
 
199
 
200
  def get_available_providers(self) -> List[LLMProvider]:
201
  """
202
+ Get list of available providers based on settings and environment
203
  """
204
  available = list()
205
 
206
+ # Check each provider based on settings
207
+ if (settings.ENABLE_OLLAMA and self._check_ollama_available()):
208
  available.append(LLMProvider.OLLAMA)
209
 
210
+ if (settings.ENABLE_OPENAI and OPENAI_AVAILABLE and self.openai_api_key):
211
  available.append(LLMProvider.OPENAI)
212
 
213
+ if (settings.ENABLE_ANTHROPIC and ANTHROPIC_AVAILABLE and self.anthropic_api_key):
214
  available.append(LLMProvider.ANTHROPIC)
215
 
216
+ if (settings.ENABLE_LLAMA_CPP and LLAMA_CPP_AVAILABLE):
217
+ available.append(LLMProvider.LLAMA_CPP)
218
+
219
+ if (settings.ENABLE_HF_INFERENCE and self.hf_client):
220
+ available.append(LLMProvider.HF_INFER)
221
+
222
+ # Sort by priority from settings
223
+ priority_order = settings.LLM_PROVIDER_PRIORITY
224
+
225
+ available.sort(key = lambda p: priority_order.index(p.value) if p.value in priority_order else len(priority_order))
226
+
227
+ log_info("Available LLM providers",
228
+ providers = [p.value for p in available],
229
+ priority = priority_order,
230
+ )
231
 
232
  return available
233
 
 
237
  """
238
  Check if rate limit allows request (simple token bucket)
239
  """
240
+ if not settings.RATE_LIMIT_ENABLED:
241
+ return True
242
+
243
+ now = time.time()
244
+ time_passed = now - self._rate_limit_last_refill
245
 
246
  # Refill tokens
247
  self._rate_limit_tokens = min(settings.RATE_LIMIT_REQUESTS, self._rate_limit_tokens + time_passed * self._rate_limit_refill_rate)
 
249
 
250
  if (self._rate_limit_tokens >= 1):
251
  self._rate_limit_tokens -= 1
 
252
  return True
253
 
254
  log_info("Rate limit hit, waiting...", tokens_remaining = self._rate_limit_tokens)
 
267
  # UNIFIED COMPLETION METHOD
268
  @ContractAnalyzerLogger.log_execution_time("llm_complete")
269
  def complete(self, prompt: str, provider: Optional[LLMProvider] = None, model: Optional[str] = None, temperature: Optional[float] = None,
270
+ max_tokens: Optional[int] = None, system_prompt: Optional[str] = None, json_mode: bool = False, retry_on_error: bool = True,
271
+ max_retries: int = 3) -> LLMResponse:
272
  """
273
+ Unified completion method for all providers with automatic fallback
274
 
275
  Arguments:
276
  ----------
 
280
 
281
  model : Model name (provider-specific)
282
 
283
+ temperature : Sampling temperature (0.0-1.0, default from settings)
284
 
285
+ max_tokens : Maximum tokens to generate (default from settings)
286
 
287
  system_prompt : System prompt (if supported)
288
 
 
290
 
291
  retry_on_error : Retry with fallback providers on error
292
 
293
+ max_retries : Maximum number of retry attempts
294
 
295
  Returns:
296
  --------
297
  { LLMResponse } : LLMResponse object
298
  """
299
+ provider = provider or self.default_provider
300
+ temperature = temperature or settings.LLM_TEMPERATURE
301
+ max_tokens = max_tokens or settings.LLM_MAX_TOKENS
302
+ system_prompt = system_prompt or settings.LLM_SYSTEM_PROMPT
303
 
304
  log_info("LLM completion request",
305
  provider = provider.value,
 
312
  # Rate limiting
313
  self._wait_for_rate_limit()
314
 
315
+ # Try primary provider with retries
316
+ for attempt in range(max_retries if retry_on_error else 1):
317
+ try:
318
+ if (provider == LLMProvider.OLLAMA):
319
+ return self._complete_ollama(prompt = prompt,
320
+ model = model,
321
+ temperature = temperature,
322
+ max_tokens = max_tokens,
323
+ system_prompt = system_prompt,
324
+ json_mode = json_mode,
325
+ )
326
+
327
+ elif (provider == LLMProvider.OPENAI):
328
+ return self._complete_openai(prompt = prompt,
329
+ model = model,
330
+ temperature = temperature,
331
+ max_tokens = max_tokens,
332
+ system_prompt = system_prompt,
333
+ json_mode = json_mode,
334
+ )
335
+
336
+ elif (provider == LLMProvider.ANTHROPIC):
337
+ return self._complete_anthropic(prompt = prompt,
338
+ model = model,
339
+ temperature = temperature,
340
+ max_tokens = max_tokens,
341
+ system_prompt = system_prompt,
342
+ )
343
+
344
+ elif (provider == LLMProvider.LLAMA_CPP):
345
+ return self._complete_llama_cpp(prompt = prompt,
346
+ model = model,
347
+ temperature = temperature,
348
+ max_tokens = max_tokens,
349
+ system_prompt = system_prompt,
350
+ json_mode = json_mode,
351
+ )
352
+
353
+ elif (provider == LLMProvider.HF_INFER):
354
+ return self._complete_hf_inference(prompt = prompt,
355
+ model = model,
356
+ temperature = temperature,
357
+ max_tokens = max_tokens,
358
+ system_prompt = system_prompt,
359
+ )
360
+
361
+ else:
362
+ raise ValueError(f"Unsupported provider: {provider}")
363
 
364
+ except Exception as e:
365
+ log_error(e, context = {"component" : "LLMManager",
366
+ "operation" : "complete",
367
+ "provider" : provider.value,
368
+ "attempt" : attempt + 1,
369
+ }
370
+ )
371
 
372
+ if (attempt < max_retries - 1):
373
+ log_info(f"Retrying attempt {attempt + 2}/{max_retries}")
374
+ # Exponential backoff
375
+ time.sleep(1 * (attempt + 1))
376
+ continue
377
+
378
+ # If retries exhausted, try fallback providers
379
+ if retry_on_error:
380
+ available_providers = self.get_available_providers()
381
+ # Remove current provider from fallback list
382
+ fallback_providers = [p for p in available_providers if p != provider]
383
 
384
+ for fallback_provider in fallback_providers:
385
+ try:
386
+ log_info(f"Attempting fallback to {fallback_provider.value}")
387
+ # Prevent infinite recursion by disabling further fallbacks
388
+ return self.complete(prompt = prompt,
389
+ provider = fallback_provider,
390
+ model = model,
391
+ temperature = temperature,
392
+ max_tokens = max_tokens,
393
+ system_prompt = system_prompt,
394
+ json_mode = json_mode,
395
+ retry_on_error = False, # No more fallbacks
396
+ )
397
+
398
+ except Exception as fallback_error:
399
+ log_error(fallback_error, context = {"component" : "LLMManager",
400
+ "operation" : "fallback_complete",
401
+ "provider" : fallback_provider.value,
402
+ }
403
+ )
404
+ continue
405
+
406
+ # All attempts failed
407
+ return LLMResponse(text = "",
408
+ provider = provider.value,
409
+ model = model or "unknown",
410
+ tokens_used = 0,
411
+ latency_seconds = 0.0,
412
+ success = False,
413
+ error_message = str(e),
414
+ )
415
+
416
+ # Should never reach here
417
+ return LLMResponse(text = "",
418
+ provider = provider.value,
419
+ model = model or "unknown",
420
+ tokens_used = 0,
421
+ latency_seconds = 0.0,
422
+ success = False,
423
+ error_message = "Unknown error",
424
+ )
425
 
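For reference, a minimal caller-side sketch of the retry-and-fallback flow above. The import path services.llm_manager is an assumption (this commit does not show the manager's module path); only the class and enum names are confirmed by the diff.

# Hypothetical usage sketch; import path assumed, not shown in this commit
from services.llm_manager import LLMManager, LLMProvider

manager = LLMManager()   # default provider comes from the manager's configuration

response = manager.complete(prompt         = "List the termination triggers in this clause.",
                            provider       = LLMProvider.LLAMA_CPP,
                            temperature    = 0.1,
                            max_tokens     = 256,
                            retry_on_error = True,   # retry, then walk the other available providers
                            )

if response.success:
    print(response.text)
else:
    print(f"All providers failed: {response.error_message}")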
426
 
427
  # OLLAMA Provider
 
429
  """
430
  Complete using local Ollama
431
  """
432
+ if not settings.ENABLE_OLLAMA:
433
+ raise ValueError("Ollama is disabled in settings")
434
+
435
  start_time = time.time()
436
  model = model or self.ollama_model
437
 
 
456
  json_mode = json_mode,
457
  )
458
 
459
+ response = requests.post(f"{self.ollama_base_url}/api/generate",
460
+ json = payload,
461
+ timeout = self.ollama_timeout,
462
+ )
463
+
464
  response.raise_for_status()
465
 
466
  result = response.json()
 
492
  """
493
  Complete using OpenAI API
494
  """
495
+ if not settings.ENABLE_OPENAI:
496
+ raise ValueError("OpenAI is disabled in settings")
497
+
498
  if not OPENAI_AVAILABLE or not self.openai_api_key:
499
  raise ValueError("OpenAI not available. Install with: pip install openai")
500
 
501
  start_time = time.time()
502
+ model = model or settings.OPENAI_MODEL
503
 
504
  # Construct messages
505
  messages = list()
 
547
  """
548
  Complete using Anthropic (Claude) API
549
  """
550
+ if not settings.ENABLE_ANTHROPIC:
551
+ raise ValueError("Anthropic is disabled in settings")
552
+
553
  if not ANTHROPIC_AVAILABLE or not self.anthropic_client:
554
  raise ValueError("Anthropic not available. Install with: pip install anthropic")
555
 
556
+ start_time = time.time()
557
+ model = model or settings.ANTHROPIC_MODEL
558
 
559
  log_info("Calling Anthropic API", model = model)
560
 
 
562
  message = self.anthropic_client.messages.create(model = model,
563
  max_tokens = max_tokens,
564
  temperature = temperature,
565
+ system = system_prompt or settings.LLM_SYSTEM_PROMPT,
566
  messages = [{"role": "user", "content": prompt}],
567
  )
568
 
 
582
  )
583
 
584
 
585
+ # Llama.cpp Provider
586
+ def _complete_llama_cpp(self, prompt: str, model: Optional[str], temperature: float, max_tokens: int, system_prompt: Optional[str], json_mode: bool) -> LLMResponse:
587
+ """
588
+ Complete using Llama.cpp (GGUF models)
589
+ """
590
+ if not settings.ENABLE_LLAMA_CPP:
591
+ raise ValueError("Llama.cpp is disabled in settings")
592
+
593
+ if not LLAMA_CPP_AVAILABLE:
594
+ raise ValueError("llama-cpp-python not installed. Install with: pip install llama-cpp-python")
595
+
596
+ start_time = time.time()
597
+
598
+ # Lazy load the model
599
+ with self.llama_cpp_lock:
600
+ if self.llama_cpp_model is None:
601
+ self._load_llama_cpp_model()
602
+
603
+ # Construct full prompt
604
+ system_prompt = system_prompt or settings.LLM_SYSTEM_PROMPT
605
+
606
+ full_prompt = f"""
607
+ {system_prompt}
608
+
609
+ {prompt}
610
+
611
+ Response:
612
+ """
613
+
614
+ log_info("Calling Llama.cpp",
615
+ model_path = str(settings.LLAMA_CPP_MODEL_PATH),
616
+ n_ctx = settings.LLAMA_CPP_N_CTX,
617
+ json_mode = json_mode,
618
+ )
619
+
620
+ # Generate response
621
+ response = self.llama_cpp_model(prompt = full_prompt,
622
+ max_tokens = max_tokens,
623
+ temperature = temperature,
624
+ top_p = settings.LLM_TOP_P,
625
+ repeat_penalty = settings.LLM_REPEAT_PENALTY,
626
+ stop = ["\n\n", "###", "Human:", "Assistant:", "</s>"],
627
+ echo = False,
628
+ )
629
+
630
+ generated_text = response['choices'][0]['text'].strip()
631
+ latency = time.time() - start_time
632
+
633
+ # Rough token estimation
634
+ tokens_used = len(full_prompt.split()) + len(generated_text.split())
635
+
636
+ log_info("Llama.cpp completion successful",
637
+ tokens_used = tokens_used,
638
+ latency_seconds = round(latency, 3),
639
+ )
640
+
641
+ return LLMResponse(text = generated_text,
642
+ provider = "llama_cpp",
643
+ model = str(settings.LLAMA_CPP_MODEL_PATH),
644
+ tokens_used = tokens_used,
645
+ latency_seconds = latency,
646
+ success = True,
647
+ raw_response = response,
648
+ )
649
+
650
+
651
+ def _load_llama_cpp_model(self):
652
+ """
653
+ Lazy load the Llama.cpp model
654
+ """
655
+ log_info("Loading Llama.cpp model", model_path=str(settings.LLAMA_CPP_MODEL_PATH))
656
+
657
+ # Ensure model exists, download if needed
658
+ if (not settings.LLAMA_CPP_MODEL_PATH.exists()):
659
+ self._download_llama_cpp_model()
660
+
661
+ # Select GPU offload layers (forced to CPU-only on HF Spaces)
662
+ n_gpu_layers = settings.LLAMA_CPP_N_GPU_LAYERS
663
+
664
+ if settings.IS_HUGGINGFACE_SPACE:
665
+ n_gpu_layers = 0
666
+
667
+ self.llama_cpp_model = Llama(model_path = str(settings.LLAMA_CPP_MODEL_PATH),
668
+ n_ctx = settings.LLAMA_CPP_N_CTX,
669
+ n_gpu_layers = n_gpu_layers,
670
+ n_batch = settings.LLAMA_CPP_N_BATCH,
671
+ n_threads = settings.LLAMA_CPP_N_THREADS,
672
+ verbose = False,
673
+ )
674
+
675
+ log_info("Llama.cpp model loaded successfully")
676
+
677
+
678
+ def _download_llama_cpp_model(self):
679
+ """
680
+ Download GGUF model from HuggingFace Hub
681
+ """
682
+ log_info("Downloading GGUF model", repo = settings.LLAMA_CPP_MODEL_REPO, filename = settings.LLAMA_CPP_MODEL_FILE)
683
+
684
+ try:
685
+ from huggingface_hub import hf_hub_download
686
+
687
+ # Ensure cache directory exists
688
+ settings.MODEL_CACHE_DIR.mkdir(parents = True, exist_ok = True)
689
+
690
+ # Download the model
691
+ downloaded_path = hf_hub_download(repo_id = settings.LLAMA_CPP_MODEL_REPO,
692
+ filename = settings.LLAMA_CPP_MODEL_FILE,
693
+ cache_dir = str(settings.MODEL_CACHE_DIR),
694
+ force_download = False,
695
+ resume_download = True,
696
+ )
697
+
698
+ # Copy the downloaded file to the expected model path
699
+ if (downloaded_path != str(settings.LLAMA_CPP_MODEL_PATH)):
700
+ import shutil
701
+ shutil.copy(downloaded_path, settings.LLAMA_CPP_MODEL_PATH)
702
+
703
+ log_info("GGUF model downloaded successfully", path = str(settings.LLAMA_CPP_MODEL_PATH))
704
+
705
+ except Exception as e:
706
+ log_error(e, context = {"component" : "LLMManager",
707
+ "operation" : "download_llama_cpp_model",
708
+ "repo" : settings.LLAMA_CPP_MODEL_REPO,
709
+ "filename" : settings.LLAMA_CPP_MODEL_FILE,
710
+ }
711
+ )
712
+ raise
713
+
714
+
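The same download path can be exercised ahead of time (for example in a Space start-up script) to warm the model cache before the first request. A small sketch reusing the settings referenced above; it simply mirrors _download_llama_cpp_model() outside the manager.

# One-off cache warm-up sketch, mirroring _download_llama_cpp_model() above
from huggingface_hub import hf_hub_download
from config.settings import settings

settings.MODEL_CACHE_DIR.mkdir(parents = True, exist_ok = True)

gguf_path = hf_hub_download(repo_id   = settings.LLAMA_CPP_MODEL_REPO,
                            filename  = settings.LLAMA_CPP_MODEL_FILE,
                            cache_dir = str(settings.MODEL_CACHE_DIR),
                            )
print(f"GGUF model cached at: {gguf_path}")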
715
+ # HuggingFace Inference Provider
716
+ def _complete_hf_inference(self, prompt: str, model: Optional[str], temperature: float, max_tokens: int, system_prompt: Optional[str]) -> LLMResponse:
717
+ """
718
+ Complete using HuggingFace Inference API
719
+ """
720
+ if not settings.ENABLE_HF_INFERENCE or not self.hf_client:
721
+ raise ValueError("HF Inference is disabled or not configured")
722
+
723
+ start_time = time.time()
724
+
725
+ # Construct full prompt
726
+ full_prompt = f"""
727
+ {system_prompt or settings.LLM_SYSTEM_PROMPT}
728
+
729
+ {prompt}
730
+
731
+ Response:
732
+ """
733
+
734
+ log_info("Calling HuggingFace Inference API")
735
+
736
+ # Generate response
737
+ response = self.hf_client.text_generation(full_prompt,
738
+ max_new_tokens = max_tokens,
739
+ temperature = temperature,
740
+ do_sample = True,
741
+ return_full_text = False,
742
+ )
743
+
744
+ generated_text = response
745
+ latency = time.time() - start_time
746
+
747
+ # Rough token estimation
748
+ tokens_used = len(full_prompt.split()) + len(generated_text.split())
749
+
750
+ log_info("HF Inference completion successful",
751
+ tokens_used = tokens_used,
752
+ latency_seconds = round(latency, 3),
753
+ )
754
+
755
+ return LLMResponse(text = generated_text,
756
+ provider = "hf_inference",
757
+ model = settings.HF_MODEL_ID or "hf_inference",
758
+ tokens_used = tokens_used,
759
+ latency_seconds = latency,
760
+ success = True,
761
+ raw_response = {"text": generated_text},
762
+ )
763
+
764
+
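The hf_client used here is constructed elsewhere in the manager and is not shown in this diff. A plausible sketch of that construction, assuming huggingface_hub.InferenceClient; the token attribute name is an assumption.

# Sketch only: how self.hf_client might be built (InferenceClient is part of huggingface_hub)
from huggingface_hub import InferenceClient
from config.settings import settings

hf_client = InferenceClient(model = settings.HF_MODEL_ID,
                            token = getattr(settings, "HF_API_TOKEN", None),   # assumed setting name
                            )

text = hf_client.text_generation("Identify the governing-law clause in this contract.",
                                 max_new_tokens   = 128,
                                 temperature      = 0.1,
                                 do_sample        = True,
                                 return_full_text = False,
                                 )
print(text)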
765
  # Specialized Methods
766
  def generate_structured_json(self, prompt: str, schema_description: str, provider: Optional[LLMProvider] = None, **kwargs) -> Dict[str, Any]:
767
  """
 
813
  raise ValueError(f"Failed to parse JSON response: {e}")
814
 
815
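Only the signature and the JSON-parse failure path of generate_structured_json() are visible here. A hedged usage sketch, assuming the method delegates to complete() with JSON output requested and returns the parsed dictionary; the import path is assumed as before.

# Hypothetical call; schema_description is free-form text describing the expected JSON shape
from services.llm_manager import LLMManager, LLMProvider   # assumed module path

manager = LLMManager()

try:
    result = manager.generate_structured_json(prompt             = "Assess the limitation-of-liability clause.",
                                              schema_description = "Object with keys: risk_level (str), concerns (list of str)",
                                              provider           = LLMProvider.LLAMA_CPP,
                                              )
    print(result.get("risk_level"))

except ValueError as e:
    # Raised when the model output cannot be parsed as JSON (see above)
    print(f"Structured generation failed: {e}")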
816
  # Utility Methods
817
  def get_provider_info(self, provider: LLMProvider) -> Dict[str, Any]:
818
  """
 
824
  }
825
 
826
  if (provider == LLMProvider.OLLAMA):
827
+ info["available"] = settings.ENABLE_OLLAMA and self._check_ollama_available()
828
 
829
  if info["available"]:
830
  info["models"] = self.list_ollama_models()
831
  info["base_url"] = self.ollama_base_url
832
 
833
  elif (provider == LLMProvider.OPENAI):
834
+ info["available"] = settings.ENABLE_OPENAI and OPENAI_AVAILABLE and bool(self.openai_api_key)
835
 
836
  if info["available"]:
837
+ info["models"] = [settings.OPENAI_MODEL, "gpt-4", "gpt-4-turbo-preview"]
838
 
839
  elif (provider == LLMProvider.ANTHROPIC):
840
+ info["available"] = settings.ENABLE_ANTHROPIC and ANTHROPIC_AVAILABLE and bool(self.anthropic_client)
841
 
842
  if info["available"]:
843
+ info["models"] = [settings.ANTHROPIC_MODEL, "claude-3-sonnet-20240229", "claude-3-opus-20240229"]
844
+
845
+ elif (provider == LLMProvider.LLAMA_CPP):
846
+ info["available"] = settings.ENABLE_LLAMA_CPP and LLAMA_CPP_AVAILABLE
847
+ info["model_path"] = str(settings.LLAMA_CPP_MODEL_PATH) if settings.LLAMA_CPP_MODEL_PATH else None
848
+ info["model_repo"] = settings.LLAMA_CPP_MODEL_REPO
849
+
850
+ elif (provider == LLMProvider.HF_INFER):
851
+ info["available"] = settings.ENABLE_HF_INFERENCE and self.hf_client is not None
852
+ info["model_id"] = settings.HF_MODEL_ID
853
 
854
  return info
855
 
 
872
  --------
873
  { float } : Estimated cost in USD
874
  """
875
+ # Local models (Ollama, Llama.cpp) and the HF Inference free tier incur no direct API cost
876
+ if provider in [LLMProvider.OLLAMA, LLMProvider.LLAMA_CPP, LLMProvider.HF_INFER]:
877
+ return 0.0
878
 
879
+ # Pricing per 1K tokens (approximate; verify against current provider price lists)
880
+ pricing = {"openai" : {"gpt-3.5-turbo" : {"prompt": 0.0015, "completion": 0.002},
881
+ "gpt-4" : {"prompt": 0.03, "completion": 0.06},
882
+ "gpt-4-turbo-preview" : {"prompt": 0.01, "completion": 0.03},
883
+ },
884
+ "anthropic" : {"claude-3-opus-20240229" : {"prompt": 0.015, "completion": 0.075},
885
+ "claude-3-sonnet-20240229" : {"prompt": 0.003, "completion": 0.015},
886
+ "claude-3-haiku-20240307" : {"prompt": 0.00025, "completion": 0.00125},
887
+ }
888
+ }
889
 
890
  provider_pricing = pricing.get(provider.value, {}).get(model)
891
 
892
  if not provider_pricing:
893
  return 0.0
894
 
895
+ cost = ((prompt_tokens / 1000) * provider_pricing["prompt"] + (completion_tokens / 1000) * provider_pricing["completion"])
896
 
897
+ return round(cost, 6)
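A worked example of the per-1K-token formula above, using the gpt-3.5-turbo rates from this table (the rates are the ones hard-coded here, not necessarily current):

# 1,200 prompt tokens + 400 completion tokens on gpt-3.5-turbo:
#   cost = (1200 / 1000) * 0.0015 + (400 / 1000) * 0.002
#        = 0.0018 + 0.0008
#        = 0.0026 USD
assert round((1200 / 1000) * 0.0015 + (400 / 1000) * 0.002, 6) == 0.0026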
requirements.txt CHANGED
@@ -37,6 +37,10 @@ requests>=2.31.0
37
  openai>=1.0.0
38
  anthropic>=0.5.0
39
 
 
 
 
 
40
  # Text Processing Utilities
41
  chardet>=5.0.0
42
  langdetect>=1.0.9
@@ -56,4 +60,7 @@ psutil>=5.9.5
56
  orjson>=3.9.0
57
 
58
  # For spaCy performance
59
- blis>=0.7.10
 
 
 
 
37
  openai>=1.0.0
38
  anthropic>=0.5.0
39
 
40
+ # For HuggingFace Spaces deployment
41
+ llama-cpp-python>=0.2.20 # For CPU-only GGUF models on HF Spaces
42
+ huggingface-hub>=0.19.0 # For downloading GGUF models
43
+
44
  # Text Processing Utilities
45
  chardet>=5.0.0
46
  langdetect>=1.0.9
 
60
  orjson>=3.9.0
61
 
62
  # For spaCy performance
63
+ blis>=0.7.10
64
+
65
+ # Additional dependency for the spaCy transformer model
66
+ spacy-transformers>=1.2.0
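A quick smoke test for the two new dependencies (a sketch; it only checks that the packages pinned above import cleanly in the Space runtime):

import llama_cpp
import huggingface_hub

print("llama-cpp-python:", llama_cpp.__version__)
print("huggingface-hub :", huggingface_hub.__version__)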
services/llm_interpreter.py CHANGED
@@ -36,6 +36,7 @@ class LLMClauseInterpreter:
36
  Arguments:
37
  ----------
38
  llm_manager { LLMManager } : LLMManager instance
 
39
  default_provider { LLMProvider } : Default LLM provider to use
40
  """
41
  self.llm_manager = llm_manager
@@ -225,7 +226,7 @@ class LLMClauseInterpreter:
225
  provider = provider,
226
  temperature = 0.3,
227
  max_tokens = 1200,
228
- fallback_providers = [LLMProvider.OPENAI, LLMProvider.ANTHROPIC],
229
  )
230
 
231
  # Calculate negotiation priority
 
36
  Arguments:
37
  ----------
38
  llm_manager { LLMManager } : LLMManager instance
39
+
40
  default_provider { LLMProvider } : Default LLM provider to use
41
  """
42
  self.llm_manager = llm_manager
 
226
  provider = provider,
227
  temperature = 0.3,
228
  max_tokens = 1200,
229
+ fallback_providers = [LLMProvider.LLAMA_CPP, LLMProvider.OPENAI, LLMProvider.ANTHROPIC],
230
  )
231
 
232
  # Calculate negotiation priority
services/negotiation_engine.py CHANGED
@@ -503,7 +503,7 @@ class NegotiationEngine:
503
  provider = provider,
504
  temperature = 0.3,
505
  max_tokens = 2000,
506
- fallback_providers = [LLMProvider.OPENAI],
507
  retry_on_error = True,
508
  )
509
  if response.success:
 
503
  provider = provider,
504
  temperature = 0.3,
505
  max_tokens = 2000,
506
+ fallback_providers = [LLMProvider.LLAMA_CPP, LLMProvider.OPENAI, LLMProvider.ANTHROPIC],
507
  retry_on_error = True,
508
  )
509
  if response.success:
services/summary_generator.py CHANGED
@@ -23,15 +23,23 @@ class SummaryGenerator:
23
  """
24
  LLM-powered executive summary generator for contract analysis : Generates professional, detailed executive summaries using ALL pipeline outputs
25
  """
26
- def __init__(self, llm_manager: Optional[LLMManager] = None):
27
  """
28
  Initialize the summary generator
29
 
30
  Arguments:
31
  ----------
32
- llm_manager { LLMManager } : LLM manager instance (if None, creates one with default settings)
 
 
33
  """
34
- self.llm_manager = llm_manager or LLMManager()
 
 
 
 
 
 
35
  self.logger = ContractAnalyzerLogger.get_logger()
36
 
37
  self.logger.info("Summary generator initialized")
@@ -39,7 +47,8 @@ class SummaryGenerator:
39
 
40
  # Main entry point with full pipeline integration
41
  def generate_executive_summary(self, contract_text: str, classification: ContractCategory, risk_analysis: RiskScore, risk_interpretation: RiskInterpretation,
42
- negotiation_playbook: NegotiationPlaybook, unfavorable_terms: List, missing_protections: List, clauses: List) -> str:
 
43
  """
44
  Generate executive summary using all the pipeline outputs
45
 
@@ -60,6 +69,8 @@ class SummaryGenerator:
60
  missing_protections { List } : Missing protections
61
 
62
  clauses { List } : Extracted clauses
 
 
63
 
64
  Returns:
65
  --------
@@ -78,7 +89,9 @@ class SummaryGenerator:
78
  )
79
 
80
  # Generate summary using LLM
81
- summary = self._generate_summary(context = context)
 
 
82
 
83
  self.logger.info(f"Executive summary generated - Risk: {context.risk_score}/100 ({context.risk_level})")
84
 
@@ -193,7 +206,7 @@ class SummaryGenerator:
193
  return findings
194
 
195
 
196
- def _generate_summary(self, context: SummaryContext) -> str:
197
  """
198
  Generate enhanced summary using comprehensive context
199
  """
@@ -203,6 +216,7 @@ class SummaryGenerator:
203
  try:
204
  response = self.llm_manager.complete(prompt = prompt,
205
  system_prompt = system_prompt,
 
206
  temperature = 0.3,
207
  max_tokens = 500,
208
  json_mode = False,
 
23
  """
24
  LLM-powered executive summary generator for contract analysis : Generates professional, detailed executive summaries using ALL pipeline outputs
25
  """
26
+ def __init__(self, llm_manager: Optional[LLMManager] = None, default_provider: Optional[LLMProvider] = None):
27
  """
28
  Initialize the summary generator
29
 
30
  Arguments:
31
  ----------
32
+ llm_manager { LLMManager } : LLM manager instance (if None, creates one with default settings)
33
+
34
+ default_provider { LLMProvider } : Default LLM provider to use when creating a new LLMManager
35
  """
36
+ # Create LLMManager with the specified provider (or use default from settings)
37
+ if llm_manager is None:
38
+ self.llm_manager = LLMManager(default_provider = default_provider)
39
+
40
+ else:
41
+ self.llm_manager = llm_manager
42
+
43
  self.logger = ContractAnalyzerLogger.get_logger()
44
 
45
  self.logger.info("Summary generator initialized")
 
47
 
48
  # Main entry point with full pipeline integration
49
  def generate_executive_summary(self, contract_text: str, classification: ContractCategory, risk_analysis: RiskScore, risk_interpretation: RiskInterpretation,
50
+ negotiation_playbook: NegotiationPlaybook, unfavorable_terms: List, missing_protections: List, clauses: List,
51
+ provider: Optional[LLMProvider] = None) -> str:
52
  """
53
  Generate executive summary using all the pipeline outputs
54
 
 
69
  missing_protections { List } : Missing protections
70
 
71
  clauses { List } : Extracted clauses
72
+
73
+ provider { LLMProvider } : Optional LLM provider override
74
 
75
  Returns:
76
  --------
 
89
  )
90
 
91
  # Generate summary using LLM
92
+ summary = self._generate_summary(context = context,
93
+ provider = provider,
94
+ )
95
 
96
  self.logger.info(f"Executive summary generated - Risk: {context.risk_score}/100 ({context.risk_level})")
97
 
 
206
  return findings
207
 
208
 
209
+ def _generate_summary(self, context: SummaryContext, provider: Optional[LLMProvider] = None) -> str:
210
  """
211
  Generate enhanced summary using comprehensive context
212
  """
 
216
  try:
217
  response = self.llm_manager.complete(prompt = prompt,
218
  system_prompt = system_prompt,
219
+ provider = provider,
220
  temperature = 0.3,
221
  max_tokens = 500,
222
  json_mode = False,
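Taken together, the new provider argument now flows from the public entry point down to the underlying complete() call. A minimal end-to-end sketch; the llm_manager import path and the shape of the pipeline object are assumptions, while services.summary_generator matches the file changed above.

# Sketch of the new provider plumbing; `pipeline` stands in for the outputs of the earlier analysis stages
from services.summary_generator import SummaryGenerator
from services.llm_manager import LLMProvider   # assumed module path

def summarize_with_llama_cpp(pipeline) -> str:
    generator = SummaryGenerator(default_provider = LLMProvider.LLAMA_CPP)

    return generator.generate_executive_summary(contract_text        = pipeline.contract_text,
                                                 classification       = pipeline.classification,
                                                 risk_analysis        = pipeline.risk_analysis,
                                                 risk_interpretation  = pipeline.risk_interpretation,
                                                 negotiation_playbook = pipeline.negotiation_playbook,
                                                 unfavorable_terms    = pipeline.unfavorable_terms,
                                                 missing_protections  = pipeline.missing_protections,
                                                 clauses              = pipeline.clauses,
                                                 provider             = LLMProvider.LLAMA_CPP,   # per-call override added in this commit
                                                 )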
utils/document_reader.py CHANGED
@@ -9,10 +9,6 @@ from typing import Union
9
  from pathlib import Path
10
  from docx import Document
11
  from typing import Optional
12
-
13
- # Add parent directory to path for imports
14
- #sys.path.append(str(Path(__file__).parent.parent))
15
-
16
  from config.settings import settings
17
 
18
 
 
9
  from pathlib import Path
10
  from docx import Document
11
  from typing import Optional
 
 
 
 
12
  from config.settings import settings
13
 
14