satyaki-mitra committed on
Commit
1ee1cb7
·
1 Parent(s): f7bf809

llama-cpp support and hf space integration

.env.huggingface ADDED
@@ -0,0 +1,128 @@
1
+ # ============================================
2
+ # HUGGINGFACE SPACES CONFIGURATION (FREE TIER)
3
+ # ============================================
4
+
5
+ # Environment Detection
6
+ IS_HUGGINGFACE_SPACE=true
7
+ DEPLOYMENT_ENV=huggingface
8
+
9
+ # ============================================
10
+ # LLM PROVIDER CONFIGURATION
11
+ # ============================================
12
+
13
+ # Provider Priority (explicit for HF Spaces)
14
+ LLM_PROVIDER_PRIORITY=llama_cpp,openai,anthropic,hf_inference
15
+ LLM_DEFAULT_PROVIDER=llama_cpp
16
+
17
+ # Provider Availability
18
+ ENABLE_OLLAMA=false # Ollama not available on HF Spaces
19
+ ENABLE_LLAMA_CPP=true # Primary: llama.cpp with GGUF models
20
+ ENABLE_OPENAI=false # Disabled unless you add API key
21
+ ENABLE_ANTHROPIC=false # Disabled unless you add API key
22
+ ENABLE_HF_INFERENCE=false # Disabled unless you enable below
23
+
24
+ # ============================================
25
+ # LLAMA.CPP CONFIGURATION (PRIMARY PROVIDER)
26
+ # ============================================
27
+
28
+ # Model Selection (Hermes-2-Pro-Llama-3-8B is excellent for legal analysis)
29
+ LLAMA_CPP_MODEL_REPO=NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF
30
+ LLAMA_CPP_MODEL_FILE=Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
31
+
32
+ # CPU-Only Configuration (CRITICAL for free tier)
33
+ LLAMA_CPP_N_GPU_LAYERS=0 # 0 = CPU only
34
+ LLAMA_CPP_N_CTX=4096 # Context window
35
+ LLAMA_CPP_N_BATCH=128 # Smaller batches for CPU memory
36
+ LLAMA_CPP_N_THREADS=4 # CPU threads (optimize for free tier)
37
+
38
+ # ============================================
39
+ # LLM GENERATION SETTINGS
40
+ # ============================================
41
+
42
+ # Generation Parameters
43
+ LLM_TEMPERATURE=0.1 # Low temperature for consistent legal analysis
44
+ LLM_MAX_TOKENS=1024 # Max tokens per response
45
+ LLM_TOP_P=0.95 # Top-p sampling
46
+ LLM_REPEAT_PENALTY=1.1 # Repeat penalty
47
+
48
+ # System Prompt (optimized for legal analysis)
49
+ LLM_SYSTEM_PROMPT="You are a specialized legal contract analyst. Provide concise, accurate analysis focusing on risk identification, clause interpretation, and practical recommendations."
50
+
51
+ # ============================================
52
+ # EXTERNAL API FALLBACKS (OPTIONAL)
53
+ # ============================================
54
+
55
+ # OpenAI API (optional fallback - add your key in Space Secrets)
56
+ # ENABLE_OPENAI=true
57
+ # OPENAI_API_KEY=sk-xxxxxxx
58
+ # OPENAI_MODEL=gpt-3.5-turbo
59
+ # OPENAI_TIMEOUT=30
60
+ # OPENAI_MAX_TOKENS=1024
61
+
62
+ # Anthropic API (optional fallback)
63
+ # ENABLE_ANTHROPIC=true
64
+ # ANTHROPIC_API_KEY=sk-ant-xxxxxxx
65
+ # ANTHROPIC_MODEL=claude-3-haiku-20240307
66
+ # ANTHROPIC_TIMEOUT=30
67
+
68
+ # HuggingFace Inference API (optional - uses HF token from environment)
69
+ # ENABLE_HF_INFERENCE=true
70
+ # HF_MODEL_ID=meta-llama/Llama-2-7b-chat-hf
71
+ # HF_API_TOKEN=${HF_TOKEN} # Automatically provided by HF Spaces
72
+
73
+ # ============================================
74
+ # APPLICATION SETTINGS
75
+ # ============================================
76
+
77
+ # File Upload Limits
78
+ MAX_UPLOAD_SIZE=10485760 # 10MB (free tier memory consideration)
79
+ ALLOWED_EXTENSIONS=.pdf,.docx,.txt
80
+
81
+ # Contract Analysis Limits
82
+ MIN_CONTRACT_LENGTH=300 # Minimum characters
83
+ MAX_CONTRACT_LENGTH=500000 # Maximum characters (500KB)
84
+
85
+ # Performance Settings
86
+ MODEL_CACHE_SIZE=2 # Cache 2 models in memory (free tier limit)
87
+ USE_GPU=false # Force CPU-only for free tier
88
+
89
+ # Logging
90
+ LOG_LEVEL=INFO
91
+ LOG_FILE=/tmp/app.log # Use tmp for ephemeral storage
92
+
93
+ # Cache Settings
94
+ ENABLE_CACHE=true
95
+ CACHE_TTL=3600 # 1 hour cache
96
+ CACHE_DIR=/tmp/cache # Use tmp for ephemeral storage
97
+
98
+ # Model Cache Directory (/data is persistent only when the Space has persistent storage enabled)
99
+ MODEL_CACHE_DIR=/data/models # Ephemeral on the free tier; enable persistent storage to keep models across restarts
100
+
101
+ # Rate Limiting (important for free tier)
102
+ RATE_LIMIT_ENABLED=true
103
+ RATE_LIMIT_REQUESTS=5 # Reduced for free tier
104
+ RATE_LIMIT_PERIOD=60 # Per minute
105
+
106
+ # ============================================
107
+ # SERVER CONFIGURATION
108
+ # ============================================
109
+
110
+ # Server Settings (HF Spaces uses port 7860)
111
+ HOST=0.0.0.0
112
+ PORT=7860 # HF Spaces default port
113
+ WORKERS=1 # Single worker for free tier
114
+ RELOAD=false # Disable reload in production
115
+
116
+ # CORS (configure for your frontend)
117
+ CORS_ORIGINS=["https://*.hf.space", "http://localhost:3000"]
118
+ CORS_ALLOW_CREDENTIALS=true
119
+ CORS_ALLOW_METHODS=["*"]
120
+ CORS_ALLOW_HEADERS=["*"]
121
+
122
+ # ============================================
123
+ # PDF REPORT SETTINGS
124
+ # ============================================
125
+
126
+ PDF_FONT_SIZE=10
127
+ PDF_MARGIN=0.5
128
+ PDF_PAGE_SIZE=letter
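A quick way to sanity-check this file is to load it through pydantic-settings, which is what `config/settings.py` uses; a minimal sketch, assuming the `Settings` class from this commit is importable and the file sits in the project root:

```python
# Sketch: load the HF Spaces env file and inspect the resolved provider setup.
from config.settings import Settings

settings = Settings(_env_file=".env.huggingface")  # pydantic-settings accepts an _env_file override at init

print(settings.DEPLOYMENT_ENV)         # "huggingface" when SPACE_ID is set, otherwise "local"
print(settings.ENABLE_LLAMA_CPP)       # the validators force this to True on Spaces
print(settings.LLM_PROVIDER_PRIORITY)  # e.g. ["llama_cpp"] on Spaces with no API keys configured
```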
Dockerfile CHANGED
@@ -2,30 +2,50 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies
6
  RUN apt-get update && apt-get install -y \
7
  curl \
8
  wget \
 
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Copy requirements and install
12
  COPY requirements.txt .
13
- RUN pip install --no-cache-dir -r requirements.txt
 
14
 
15
  # Download spaCy model
16
  RUN python -m spacy download en_core_web_sm
17
 
18
- # Install Ollama
19
- RUN curl -fsSL https://ollama.ai/install.sh | sh
20
-
21
  # Copy application
22
  COPY . .
23
 
24
- # Create directories
25
- RUN mkdir -p uploads cache logs
26
 
27
- # Expose port
28
  EXPOSE 7860
29
 
30
- # Simple CMD - start Ollama in background, then start FastAPI
31
- CMD ollama serve & sleep 20 && ollama pull llama3:8b & uvicorn app:app --host 0.0.0.0 --port 7860
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies for llama-cpp-python and PDF processing
6
  RUN apt-get update && apt-get install -y \
7
  curl \
8
  wget \
9
+ git \
10
+ build-essential \
11
+ cmake \
12
+ pkg-config \
13
+ libopenblas-dev \
14
+ liblapack-dev \
15
+ libxml2-dev \
16
+ libxslt1-dev \
17
+ zlib1g-dev \
18
+ libjpeg-dev \
19
+ libpng-dev \
20
+ libfreetype6-dev \
21
  && rm -rf /var/lib/apt/lists/*
22
 
23
+ # Copy requirements and install with optimizations
24
  COPY requirements.txt .
25
+ RUN pip install --no-cache-dir --upgrade pip && \
26
+ pip install --no-cache-dir -r requirements.txt
27
 
28
  # Download spaCy model
29
  RUN python -m spacy download en_core_web_sm
30
 
 
 
 
31
  # Copy application
32
  COPY . .
33
 
34
+ # Create directories (HF Spaces uses /data for persistent storage)
35
+ RUN mkdir -p uploads cache logs /data/models
36
 
37
+ # Expose port (HF Spaces uses 7860 by default)
38
  EXPOSE 7860
39
 
40
+ # Environment variables for CPU-only operation
41
+ ENV LLAMA_CPP_N_GPU_LAYERS=0
42
+ ENV CUDA_VISIBLE_DEVICES="" # Disable CUDA for free tier
43
+ ENV OMP_NUM_THREADS=4 # Optimize for CPU
44
+ ENV NUMEXPR_MAX_THREADS=4
45
+
46
+ # HEALTH CHECK for HF Spaces
47
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
48
+ CMD curl -f http://localhost:7860/api/v1/health || exit 1
49
+
50
+ # CMD for HuggingFace Spaces (NO Ollama!)
51
+ CMD uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1 --timeout-keep-alive 30
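The HEALTHCHECK above probes `/api/v1/health`, so the FastAPI app is expected to serve a route at that path (the prefix matches `API_PREFIX="/api/v1/"` in `config/settings.py`). A minimal sketch of such an endpoint; the handler name and response shape are illustrative, not taken from `app.py`:

```python
# Illustrative health endpoint matching the Dockerfile HEALTHCHECK path.
from fastapi import FastAPI

app = FastAPI()

@app.get("/api/v1/health")
def health() -> dict:
    # Keep this cheap: the Docker healthcheck hits it every 30 seconds.
    return {"status": "ok"}
```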
README.md CHANGED
@@ -22,12 +22,21 @@ license: mit
22
  [![Legal-BERT](https://img.shields.io/badge/Legal--BERT-nlpaueb/legal--bert--base--uncased-orange)](https://huggingface.co/nlpaueb/legal-bert-base-uncased)
23
  [![Sentence-BERT](https://img.shields.io/badge/Sentence--BERT-all--MiniLM--L6--v2-lightgrey)](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
24
  [![Ollama](https://img.shields.io/badge/Ollama-llama3:8b-7c3aed)](https://ollama.ai/)
 
25
  [![Docker](https://img.shields.io/badge/Docker-Ready-2496ed)](https://docker.com/)
26
  [![spaCy](https://img.shields.io/badge/spaCy-3.7+-09a3d5)](https://spacy.io/)
27
 
28
  > **Democratizing Legal Intelligence Through AI**
29
  > Comprehensive contract risk analysis using an integrated pipeline with Legal-BERT, multi-model NLP, and LLM interpretation
30
 
 
31
  </div>
32
 
33
  ## 🎯 Overview
@@ -39,8 +48,8 @@ The AI Contract Risk Analyzer is a production-grade legal document analysis plat
39
  - 📄 **Multi-Format Support**: PDF, DOCX, TXT document processing
40
  - 🔍 **9 Contract Categories**: Employment, NDA, Lease, Service agreements, etc.
41
  - ⚡ **Sub-60s Analysis**: Real-time risk scoring and clause extraction via pre-loaded models
42
- - 🔒 **Privacy-First**: Ephemeral processing, zero data retention
43
- - 🌐 **LLM Integration**: Ollama (local), OpenAI, Anthropic support with fallback
44
  - 📊 **Comprehensive Reports**: Executive summaries, negotiation playbooks, market comparisons, and downloadable PDFs
45
  - 🔄 **Integrated Pipeline**: A single orchestrator (`PreloadedAnalysisService`) ensures consistent context propagation from classification through to final reporting
46
 
@@ -108,7 +117,8 @@ This diagram illustrates the core components and their interactions, highlightin
108
  │ └─────────────────────────────────────────────────────┘ │
109
  │ ┌─────────────────────────────────────────────────────┐ │
110
  │ │ LLM Manager (Multi-Provider) │ │
111
- │ │ - Ollama (Local, Free)
 
112
  │ │ - OpenAI (GPT-3.5/4) │ │
113
  │ │ - Anthropic (Claude) │ │
114
  │ │ - Auto-Fallback & Rate Limiting │ │
@@ -243,7 +253,7 @@ graph LR
243
 
244
  ---
245
 
246
- ## 🚀 Installation
247
 
248
  ### Prerequisites
249
 
@@ -255,6 +265,38 @@ Storage: 10GB for models
255
  GPU: Optional (3x speedup with NVIDIA GPU + CUDA 11.8+)
256
  ```
257
 
 
 
 
 
258
  ### Quick Install
259
 
260
  ```bash
@@ -333,6 +375,16 @@ python app.py
333
  uvicorn app:app --reload --host 0.0.0.0 --port 8000
334
  ```
335
 
 
 
336
  ---
337
 
338
  ## 🔧 Technical Details
@@ -346,10 +398,17 @@ Legal-BERT: nlpaueb/legal-bert-base-uncased # 110M params, 768-dim
346
  Sentence-BERT: all-MiniLM-L6-v2 # 22M params, 384-dim
347
 
348
  # LLM Integration
349
- Ollama: llama3:8b (local, free)
 
350
  OpenAI: gpt-3.5-turbo, gpt-4
351
  Anthropic: claude-3-sonnet, claude-3-opus
352
 
 
 
 
 
 
 
353
  # Deep Learning Framework
354
  PyTorch: 2.1+
355
  Transformers: 4.35+ (Hugging Face)
@@ -494,8 +553,43 @@ Sentence-BERT Model: ~100MB
494
  LLM Manager: ~50MB
495
  Total (Idle): ~600MB
496
  Total (Peak): ~1.2GB
 
 
 
 
497
  ```
498
 
 
 
499
  ---
500
 
501
  ## 📝 License
 
22
  [![Legal-BERT](https://img.shields.io/badge/Legal--BERT-nlpaueb/legal--bert--base--uncased-orange)](https://huggingface.co/nlpaueb/legal-bert-base-uncased)
23
  [![Sentence-BERT](https://img.shields.io/badge/Sentence--BERT-all--MiniLM--L6--v2-lightgrey)](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
24
  [![Ollama](https://img.shields.io/badge/Ollama-llama3:8b-7c3aed)](https://ollama.ai/)
25
+ [![Llama.cpp](https://img.shields.io/badge/Llama.cpp-GGUF_Models-4B5563)](https://github.com/ggerganov/llama.cpp)
26
  [![Docker](https://img.shields.io/badge/Docker-Ready-2496ed)](https://docker.com/)
27
  [![spaCy](https://img.shields.io/badge/spaCy-3.7+-09a3d5)](https://spacy.io/)
28
 
29
  > **Democratizing Legal Intelligence Through AI**
30
  > Comprehensive contract risk analysis using an integrated pipeline with Legal-BERT, multi-model NLP, and LLM interpretation
31
 
32
+ > **⚠️ Important Disclaimer**: This tool provides AI-assisted contract analysis and is not a substitute for professional legal advice. Always consult a qualified attorney for legal matters. The AI may produce inaccurate or incomplete analyses.
33
+
34
+
35
+ > **🔐 Data Privacy**: Choose your deployment carefully:
36
+ > - **Local deployment** (Ollama/Llama.cpp) = Maximum privacy
37
+ > - **Cloud deployment** = Files processed on external servers
38
+ > - **API providers** (OpenAI/Anthropic) = Contract text sent to third parties
39
+
40
  </div>
41
 
42
  ## 🎯 Overview
 
48
  - 📄 **Multi-Format Support**: PDF, DOCX, TXT document processing
49
  - 🔍 **9 Contract Categories**: Employment, NDA, Lease, Service agreements, etc.
50
  - ⚡ **Sub-60s Analysis**: Real-time risk scoring and clause extraction via pre-loaded models
51
+ - 🛡️ **Privacy-Flexible**: Choose from 100% local (Ollama), local GGUF models on cloud hardware (llama.cpp), or external APIs
52
+ - 🌐 **Multi-Provider LLM**: Ollama (100% local), llama.cpp (local GGUF models), OpenAI, Anthropic with fallback
53
  - 📊 **Comprehensive Reports**: Executive summaries, negotiation playbooks, market comparisons, and downloadable PDFs
54
  - 🔄 **Integrated Pipeline**: A single orchestrator (`PreloadedAnalysisService`) ensures consistent context propagation from classification through to final reporting
55
 
 
117
  │ └─────────────────────────────────────────────────────┘ │
118
  │ ┌─────────────────────────────────────────────────────┐ │
119
  │ │ LLM Manager (Multi-Provider) │ │
120
+ │ │ - Ollama (Local, Free) │ │
121
+ │ │ - Llama.cpp (GGUF Models, CPU/GPU) │ │
122
  │ │ - OpenAI (GPT-3.5/4) │ │
123
  │ │ - Anthropic (Claude) │ │
124
  │ │ - Auto-Fallback & Rate Limiting │ │
 
253
 
254
  ---
255
 
256
+ ## 🚀 Installation Options
257
 
258
  ### Prerequisites
259
 
 
265
  GPU: Optional (3x speedup with NVIDIA GPU + CUDA 11.8+)
266
  ```
267
 
268
+
269
+ ### Installation Options
270
+
271
+ Choose based on your privacy and hardware requirements:
272
+
273
+ #### 🔒 Option A: Maximum Privacy (Local Ollama)
274
+ ```bash
275
+ # For complete local processing
276
+ pip install -r requirements.txt
277
+ ollama serve
278
+ ollama pull llama3:8b
279
+ ```
280
+ #### 💻 Option B: Good Privacy + CPU Support (Local Llama.cpp)
281
+ ```bash
282
+ # For systems without GPU or Ollama
283
+ pip install llama-cpp-python huggingface-hub
284
+ # Models downloaded automatically on first run
285
+ ```
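For reference, the "downloaded automatically on first run" step in Option B boils down to something like the sketch below. The repo name follows `.env.huggingface`; the quantized file name is an assumption and should match one of the `.gguf` files actually published in that repo, and the loading code in `model_manager/llm_manager.py` may differ in detail.

```python
# Sketch of the Option B download-and-load path (CPU-only, Q4_K_M quantization assumed).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
    filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",  # pick a file name from the repo listing
    local_dir="models",
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,       # matches LLAMA_CPP_N_CTX
    n_gpu_layers=0,   # CPU-only, as in the free-tier config
    n_threads=4,
)

out = llm("Summarize the key risks in a non-compete clause.", max_tokens=256, temperature=0.1)
print(out["choices"][0]["text"])
```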
286
+
287
+ #### ☁️ Option C: Free Cloud (HuggingFace Spaces)
288
+ ```bash
289
+ # No installation needed
290
+ # Visit: https://huggingface.co/spaces/[your-space]
291
+ # Models automatically downloaded, runs on HF infrastructure
292
+ ```
293
+
294
+ #### 🌐 Option D: External APIs (Best Quality)
295
+ ```bash
296
+ # Add API keys to .env for OpenAI/Anthropic
297
+ # Models run on external servers
298
+ ```
299
+
300
  ### Quick Install
301
 
302
  ```bash
 
375
  uvicorn app:app --reload --host 0.0.0.0 --port 8000
376
  ```
377
 
378
+
379
+ ### Deployment Options Summary
380
+
381
+ | Option | Privacy | Setup | Best For |
382
+ |--------|---------|-------|----------|
383
+ | **Local Ollama** | 🔒 Maximum | Medium | Sensitive contracts |
384
+ | **Local Llama.cpp** | 🔒 High | Easy | General use, CPU-only |
385
+ | **HF Spaces** | 🟡 Medium | Trivial | Demos, testing |
386
+ | **External APIs** | 🟡 Medium | Easy | Non-sensitive, best quality |
387
+
388
  ---
389
 
390
  ## 🔧 Technical Details
 
398
  Sentence-BERT: all-MiniLM-L6-v2 # 22M params, 384-dim
399
 
400
  # LLM Integration
401
+ Ollama: llama3:8b (100% local, maximum privacy)
402
+ Llama.cpp: GGUF models (local models on CPU/GPU)
403
  OpenAI: gpt-3.5-turbo, gpt-4
404
  Anthropic: claude-3-sonnet, claude-3-opus
405
 
406
+
407
+ # Privacy Levels:
408
+ 1. Ollama → 100% local, no data leaves
409
+ 2. Llama.cpp → Models run locally on your hardware
410
+ 3. OpenAI/Anthropic → Data sent to external servers
411
+
412
  # Deep Learning Framework
413
  PyTorch: 2.1+
414
  Transformers: 4.35+ (Hugging Face)
 
553
  LLM Manager: ~50MB
554
  Total (Idle): ~600MB
555
  Total (Peak): ~1.2GB
556
+
557
+ ```
558
+
559
+ ---
560
+
561
+ ## 🔒 Privacy & Data Safety
562
+
563
+ ### Data Handling by Deployment Type
564
+
565
+ | Deployment | Privacy Level | Where Models Run | Where Files Go | Best For |
566
+ |------------|---------------|------------------|----------------|----------|
567
+ | **Local Ollama** | 🔒 Maximum | Your machine | Your machine only | Sensitive NDAs, employment |
568
+ | **Local Llama.cpp** | 🔒 High | Your machine | Your machine only | General contracts, CPU-only |
569
+ | **HuggingFace Spaces** | 🟡 Medium | HF servers | Temporary HF storage | Testing, public demos |
570
+ | **External APIs** | 🟡 Medium | OpenAI/Anthropic | Sent to 3rd parties | Non-sensitive contracts |
571
+
572
+ ### Configuration for Different Privacy Needs
573
+
574
+ **For Maximum Privacy (Legal Firms, Sensitive Data):**
575
+ ```env
576
+ ENABLE_OLLAMA=true # 100% local
577
+ ENABLE_LLAMA_CPP=true # Local GGUF models
578
+ ENABLE_OPENAI=false # No external data
579
+ ENABLE_ANTHROPIC=false # No external data
580
  ```
581
 
582
+ **For Public Demos (HuggingFace Spaces):**
583
+
584
+ ```env
585
+ ENABLE_OLLAMA=false # Not available on HF
586
+ ENABLE_LLAMA_CPP=true # Local models on HF servers
587
+ ENABLE_OPENAI=false # Optional if API key added
588
+ ENABLE_ANTHROPIC=false # Optional if API key added
589
+ ```
590
+
591
+ > ⚠️ Important: No deployment option provides attorney-client privilege. Always consult a lawyer for legal advice.
592
+
593
  ---
594
 
595
  ## 📝 License
app.py CHANGED
@@ -298,7 +298,7 @@ class PreloadedAnalysisService:
298
  try:
299
  # Initialize with LLM manager - ensure constructor args match
300
  self.services["negotiation_engine"] = NegotiationEngine(llm_manager = self.llm_manager,
301
- default_provider = LLMProvider.OLLAMA,
302
  )
303
  self.service_status["negotiation_engine"] = "loaded"
304
 
@@ -314,7 +314,9 @@ class PreloadedAnalysisService:
314
  log_info("🔄 Pre-loading Summary Generator...")
315
  try:
316
  # Initialize with LLM manager
317
- self.services["summary_generator"] = SummaryGenerator(llm_manager = self.llm_manager)
 
 
318
  self.service_status["summary_generator"] = "loaded"
319
 
320
  log_info("✅ Summary Generator loaded")
@@ -594,7 +596,7 @@ class PreloadedAnalysisService:
594
  contract_type = contract_type_enum,
595
  overall_risk_score = risk_score.overall_score,
596
  max_clauses = len(clauses),
597
- provider = LLMProvider.OLLAMA,
598
  )
599
  log_info("LLM risk interpretation generated")
600
 
 
298
  try:
299
  # Initialize with LLM manager - ensure constructor args match
300
  self.services["negotiation_engine"] = NegotiationEngine(llm_manager = self.llm_manager,
301
+ default_provider = None,
302
  )
303
  self.service_status["negotiation_engine"] = "loaded"
304
 
 
314
  log_info("🔄 Pre-loading Summary Generator...")
315
  try:
316
  # Initialize with LLM manager
317
+ self.services["summary_generator"] = SummaryGenerator(llm_manager = self.llm_manager,
318
+ default_provider = None,
319
+ )
320
  self.service_status["summary_generator"] = "loaded"
321
 
322
  log_info("✅ Summary Generator loaded")
 
596
  contract_type = contract_type_enum,
597
  overall_risk_score = risk_score.overall_score,
598
  max_clauses = len(clauses),
599
+ provider = None,
600
  )
601
  log_info("LLM risk interpretation generated")
602
 
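These hunks stop pinning the provider to `LLMProvider.OLLAMA` and let the manager resolve it from settings at runtime. A minimal sketch of the intended call path; class and field names are taken from this commit, the prompt string is illustrative:

```python
# Sketch: with provider/default_provider left as None, LLMManager falls back to
# settings.LLM_DEFAULT_PROVIDER ("llama_cpp" on HF Spaces, "ollama" locally).
from model_manager.llm_manager import LLMManager

manager = LLMManager(default_provider=None)
response = manager.complete(prompt="List the termination triggers in this clause.")
print(response.provider, response.success, response.text[:200])
```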
config/settings.py CHANGED
@@ -1,7 +1,10 @@
1
  # DEPENDENCIES
 
2
  from pathlib import Path
3
  from pydantic import Field
 
4
  from typing import Optional
 
5
  from pydantic_settings import BaseSettings
6
 
7
 
@@ -10,66 +13,112 @@ class Settings(BaseSettings):
10
  Application-wide settings: primary configuration source
11
  """
12
  # Application Info
13
- APP_NAME : str = "AI Contract Risk Analyzer"
14
- APP_VERSION : str = "1.0.0"
15
- API_PREFIX : str = "/api/v1/"
16
 
17
  # Server Configuration
18
- HOST : str = "0.0.0.0"
19
- PORT : int = 8000
20
- RELOAD : bool = True
21
- WORKERS : int = 1
22
 
23
  # CORS Settings
24
- CORS_ORIGINS : list = ["http://localhost:3000", "http://localhost:8000", "http://127.0.0.1:8000"]
25
- CORS_ALLOW_CREDENTIALS : bool = True
26
- CORS_ALLOW_METHODS : list = ["*"]
27
- CORS_ALLOW_HEADERS : list = ["*"]
28
 
29
  # File Upload Settings
30
- MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024 # 10 MB
31
- ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
32
- UPLOAD_DIR : Path = Path("uploads")
33
 
34
  # Model Management Settings
35
- MODEL_CACHE_SIZE : int = 3 # Number of models to keep in memory
36
- MODEL_DOWNLOAD_TIMEOUT : int = 1800 # 30 minutes
37
- USE_GPU : bool = True # Automatically detect and use GPU if available
 
 
 
38
 
 
 
 
 
 
 
39
  # External API Settings
40
- OLLAMA_BASE_URL : str = "http://localhost:11434"
41
- OLLAMA_MODEL : str = "llama3:8b"
42
- OLLAMA_TIMEOUT : int = 300
43
- OLLAMA_TEMPERATURE : float = 0.1
 
 
 
 
44
 
45
- # External API Keys
46
- OPENAI_API_KEY : Optional[str] = None
47
- ANTHROPIC_API_KEY : Optional[str] = None
 
 
 
 
 
 
 
48
 
49
  # Analysis Limits
50
- MIN_CONTRACT_LENGTH : int = 300 # Minimum characters for valid contract
51
- MAX_CONTRACT_LENGTH : int = 500000 # Maximum characters (500KB text)
52
- MAX_CLAUSES_TO_ANALYZE : int = 100
53
 
54
  # Logging Settings
55
- LOG_LEVEL : str = "INFO"
56
- LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
57
- LOG_FILE : Optional[Path] = Path("logs/app.log")
58
 
59
  # Cache Settings
60
- ENABLE_CACHE : bool = True
61
- CACHE_TTL : int = 3600 # 1 hour
62
- CACHE_DIR : Path = Path("cache")
63
 
64
- # Rate Limiting Settings
65
- RATE_LIMIT_ENABLED : bool = True
66
- RATE_LIMIT_REQUESTS : int = 10
67
- RATE_LIMIT_PERIOD : int = 60 # seconds
68
 
69
- # PDF Report Settings
70
- PDF_FONT_SIZE : int = 10
71
- PDF_MARGIN : float = 0.5 # inches
72
- PDF_PAGE_SIZE : str = "letter"
73
 
74
 
75
  class Config:
@@ -78,14 +127,196 @@ class Settings(BaseSettings):
78
  case_sensitive = True
79
 
80
 
 
 
 
 
81
  def __init__(self, **kwargs):
82
  super().__init__(**kwargs)
83
- # Ensure directories exist
84
  self.UPLOAD_DIR.mkdir(parents = True, exist_ok = True)
85
- self.CACHE_DIR.mkdir(parents = True, exist_ok = True)
 
86
 
87
  if self.LOG_FILE:
88
  self.LOG_FILE.parent.mkdir(parents = True, exist_ok = True)
 
89
 
90
 
91
  # Global settings instance
 
1
  # DEPENDENCIES
2
+ import os
3
  from pathlib import Path
4
  from pydantic import Field
5
+ from typing import Literal
6
  from typing import Optional
7
+ from pydantic import field_validator
8
  from pydantic_settings import BaseSettings
9
 
10
 
 
13
  Application-wide settings: primary configuration source
14
  """
15
  # Application Info
16
+ APP_NAME : str = "AI Contract Risk Analyzer"
17
+ APP_VERSION : str = "1.0.0"
18
+ API_PREFIX : str = "/api/v1/"
19
 
20
  # Server Configuration
21
+ HOST : str = "0.0.0.0"
22
+ PORT : int = 8000
23
+ RELOAD : bool = True
24
+ WORKERS : int = 1
25
 
26
  # CORS Settings
27
+ CORS_ORIGINS : list = ["http://localhost:3000", "http://localhost:8000", "http://127.0.0.1:8000"]
28
+ CORS_ALLOW_CREDENTIALS : bool = True
29
+ CORS_ALLOW_METHODS : list = ["*"]
30
+ CORS_ALLOW_HEADERS : list = ["*"]
31
 
32
  # File Upload Settings
33
+ MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024 # 10 MB
34
+ ALLOWED_EXTENSIONS : list = [".pdf", ".docx", ".txt"]
35
+ UPLOAD_DIR : Path = Path("uploads")
36
 
37
  # Model Management Settings
38
+ MODEL_CACHE_SIZE : int = 3 # Number of models to keep in memory
39
+ MODEL_DOWNLOAD_TIMEOUT : int = 1800 # 30 minutes
40
+ USE_GPU : bool = True # Automatically detect and use GPU if available
41
+
42
+ # Environment Detection Settings
43
+ IS_HUGGINGFACE_SPACE : bool = False # Auto-detected
44
+ IS_LOCAL : bool = True # Auto-detected
45
+ DEPLOYMENT_ENV : Literal["local", "huggingface", "docker", "kubernetes", "aws", "cloud"] = "local" # Covers every value detect_environment can return
46
+
47
+ # LLAMA.CPP Settings (For HF Spaces)
48
+ LLAMA_CPP_ENABLED : bool = False # Auto-enabled in HF Spaces
49
+ LLAMA_CPP_MODEL_PATH : Optional[Path] = None # Local path to GGUF model
50
+ LLAMA_CPP_MODEL_REPO : str = "NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF"
51
+ LLAMA_CPP_MODEL_FILE : str = "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf"
52
+ LLAMA_CPP_N_CTX : int = 4096 # Context window
53
+ LLAMA_CPP_N_GPU_LAYERS : int = -1 # -1 = all layers on GPU
54
+ LLAMA_CPP_N_BATCH : int = 512 # Batch size for prompt processing
55
+ LLAMA_CPP_N_THREADS : int = 4 # CPU threads (0 = auto)
56
 
57
+ # Ollama Settings (For Local)
58
+ OLLAMA_BASE_URL : str = "http://localhost:11434"
59
+ OLLAMA_MODEL : str = "llama3:8b"
60
+ OLLAMA_TIMEOUT : int = 300
61
+ OLLAMA_TEMPERATURE : float = 0.1
62
+
63
  # External API Settings
64
+ OPENAI_API_KEY : Optional[str] = None
65
+ OPENAI_MODEL : str = "gpt-3.5-turbo"
66
+ OPENAI_TIMEOUT : int = 30
67
+ OPENAI_TEMPERATURE : float = 0.1
68
+ OPENAI_MAX_TOKENS : int = 1024
69
+
70
+ ANTHROPIC_API_KEY : Optional[str] = None
71
+ ANTHROPIC_MODEL : str = "claude-3-haiku-20240307"
72
+ ANTHROPIC_TIMEOUT : int = 30
73
+ ANTHROPIC_TEMPERATURE : float = 0.1
74
+ ANTHROPIC_MAX_TOKENS : int = 1024
75
+
76
+ # Priority order for LLM providers
77
+ LLM_PROVIDER_PRIORITY : list = ["ollama", "openai", "anthropic", "llama_cpp"]
78
+
79
+ # Which providers are available
80
+ ENABLE_OLLAMA : bool = True
81
+ ENABLE_LLAMA_CPP : bool = False # Auto-enabled in HF Spaces
82
+ ENABLE_OPENAI : bool = False
83
+ ENABLE_ANTHROPIC : bool = False
84
+ ENABLE_HF_INFERENCE : bool = False # HuggingFace Inference API
85
+
86
+ # Default provider (auto-selected based on environment)
87
+ LLM_DEFAULT_PROVIDER : str = "ollama"
88
 
89
+ # Huggingface Inference Settings (Optional)
90
+ HF_MODEL_ID : Optional[str] = None # e.g. "meta-llama/Llama-2-7b-chat-hf"
91
+ HF_API_TOKEN : Optional[str] = None # HF token for gated models
92
+
93
+ # LLM Generation Settings (Shared across providers)
94
+ LLM_TEMPERATURE : float = 0.1 # Default for all providers
95
+ LLM_MAX_TOKENS : int = 1024 # Default for all providers
96
+ LLM_TOP_P : float = 0.95 # Default top-p sampling
97
+ LLM_REPEAT_PENALTY : float = 1.1 # Default repeat penalty
98
+ LLM_SYSTEM_PROMPT : str = "You are a helpful legal assistant specializing in contract analysis and risk assessment."
99
 
100
  # Analysis Limits
101
+ MIN_CONTRACT_LENGTH : int = 300 # Minimum characters for valid contract
102
+ MAX_CONTRACT_LENGTH : int = 500000 # Maximum characters (500KB text)
103
+ MAX_CLAUSES_TO_ANALYZE : int = 100
104
 
105
  # Logging Settings
106
+ LOG_LEVEL : str = "INFO"
107
+ LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
108
+ LOG_FILE : Optional[Path] = Path("logs/app.log")
109
 
110
  # Cache Settings
111
+ ENABLE_CACHE : bool = True
112
+ CACHE_TTL : int = 3600 # 1 hour
113
+ CACHE_DIR : Path = Path("cache")
114
 
115
+ # Model Cache Directory (for llama.cpp models)
116
+ MODEL_CACHE_DIR : Path = Path("data/models")
 
 
117
 
118
+ # Rate Limiting Settings
119
+ RATE_LIMIT_ENABLED : bool = False
120
+ RATE_LIMIT_REQUESTS : int = 10
121
+ RATE_LIMIT_PERIOD : int = 60 # seconds
122
 
123
 
124
  class Config:
 
127
  case_sensitive = True
128
 
129
 
130
+ @field_validator('IS_HUGGINGFACE_SPACE', 'IS_LOCAL', 'DEPLOYMENT_ENV', mode = 'before')
131
+ def detect_environment(cls, v, info):
132
+ """
133
+ Auto-detect deployment environment
134
+ """
135
+ field_name = info.field_name
136
+
137
+ if (field_name == 'IS_HUGGINGFACE_SPACE'):
138
+ return bool(os.getenv('SPACE_ID'))
139
+
140
+ elif (field_name == 'IS_LOCAL'):
141
+ # Check if not in any container/cloud environment
142
+ return not any([os.getenv('SPACE_ID'),
143
+ os.getenv('DOCKER_CONTAINER'),
144
+ os.getenv('KUBERNETES_SERVICE_HOST'),
145
+ os.getenv('AWS_EXECUTION_ENV')
146
+ ])
147
+
148
+ elif (field_name == 'DEPLOYMENT_ENV'):
149
+ if os.getenv('SPACE_ID'):
150
+ return "huggingface"
151
+
152
+ elif os.getenv('DOCKER_CONTAINER'):
153
+ return "docker"
154
+
155
+ elif os.getenv('KUBERNETES_SERVICE_HOST'):
156
+ return "kubernetes"
157
+
158
+ elif os.getenv('AWS_EXECUTION_ENV'):
159
+ return "aws"
160
+
161
+ else:
162
+ return "local"
163
+
164
+ return v
165
+
166
+
167
+ @field_validator('ENABLE_LLAMA_CPP', 'LLAMA_CPP_ENABLED', mode = 'after')
168
+ def enable_llama_cpp_for_hf(cls, v, info):
169
+ """
170
+ Auto-enable llama.cpp for HuggingFace Spaces
171
+ """
172
+ values = info.data
173
+
174
+ if values.get('IS_HUGGINGFACE_SPACE'):
175
+ return True
176
+
177
+ return v
178
+
179
+
180
+ @field_validator('ENABLE_OLLAMA', mode = 'after')
181
+ def disable_ollama_for_hf(cls, v, info):
182
+ """
183
+ Auto-disable Ollama for HuggingFace Spaces
184
+ """
185
+ values = info.data
186
+
187
+ if values.get('IS_HUGGINGFACE_SPACE'):
188
+ return False
189
+
190
+ return v
191
+
192
+
193
+ @field_validator('LLM_PROVIDER_PRIORITY', mode='after')
194
+ def adjust_provider_priority(cls, v, info):
195
+ """
196
+ Adjust provider priority based on environment
197
+ """
198
+ values = info.data
199
+
200
+ if values.get('IS_HUGGINGFACE_SPACE'):
201
+ # For HF Spaces: llama_cpp first, then external APIs
202
+ priority = []
203
+
204
+ if (values.get('ENABLE_LLAMA_CPP')):
205
+ priority.append("llama_cpp")
206
+
207
+ if (values.get('ENABLE_HF_INFERENCE') and values.get('HF_API_TOKEN')):
208
+ priority.append("hf_inference")
209
+
210
+ if (values.get('ENABLE_OPENAI') and values.get('OPENAI_API_KEY')):
211
+ priority.append("openai")
212
+
213
+ if (values.get('ENABLE_ANTHROPIC') and values.get('ANTHROPIC_API_KEY')):
214
+ priority.append("anthropic")
215
+
216
+ return priority if priority else ["llama_cpp"]
217
+
218
+ else:
219
+ # For local: Ollama first
220
+ priority = list()
221
+
222
+ if values.get('ENABLE_OLLAMA'):
223
+ priority.append("ollama")
224
+
225
+ if values.get('ENABLE_LLAMA_CPP'):
226
+ priority.append("llama_cpp")
227
+
228
+ if values.get('ENABLE_OPENAI') and values.get('OPENAI_API_KEY'):
229
+ priority.append("openai")
230
+
231
+ if values.get('ENABLE_ANTHROPIC') and values.get('ANTHROPIC_API_KEY'):
232
+ priority.append("anthropic")
233
+
234
+ return priority if priority else ["ollama"]
235
+
236
+
237
+ @field_validator('LLM_DEFAULT_PROVIDER', mode='after')
238
+ def set_default_provider(cls, v, info):
239
+ """
240
+ Set default provider based on availability
241
+ """
242
+ values = info.data
243
+
244
+ # Get the priority list (after adjustments)
245
+ priority = values.get('LLM_PROVIDER_PRIORITY', [])
246
+
247
+ if priority:
248
+ # First available provider is default
249
+ return priority[0]
250
+
251
+ # Fallback
252
+ return "ollama"
253
+
254
+
255
+ @field_validator('MODEL_CACHE_DIR')
256
+ def set_model_cache_dir(cls, v, info):
257
+ """
258
+ Set appropriate model cache directory based on environment
259
+ """
260
+ values = info.data
261
+
262
+ if (values.get('IS_HUGGINGFACE_SPACE')):
263
+ # HF Spaces expose /data (persistent only when persistent storage is enabled)
264
+ return Path("/data/models")
265
+
266
+ elif (values.get('DEPLOYMENT_ENV') == "docker"):
267
+ # Docker containers
268
+ return Path("/app/models")
269
+
270
+ else:
271
+ # Local development
272
+ return Path("models")
273
+
274
+
275
+ @field_validator('LLAMA_CPP_N_GPU_LAYERS')
276
+ def optimize_gpu_layers(cls, v, info):
277
+ """
278
+ Auto-optimize GPU layers for different environments
279
+ """
280
+ values = info.data
281
+
282
+ if values.get('IS_HUGGINGFACE_SPACE'):
283
+ # HF Spaces: T4 GPU with 15-16GB VRAM
284
+ # For 8B Q4 model: ~20 layers is safe
285
+ return 20
286
+
287
+ elif v == -1: # -1 means "use all layers"
288
+ # For local with sufficient GPU
289
+ return -1
290
+
291
+ else:
292
+ # Explicit value from config
293
+ return v
294
+
295
+
296
+ @field_validator('LLAMA_CPP_MODEL_PATH')
297
+ def set_default_model_path(cls, v, info):
298
+ """
299
+ Set default model path if not specified
300
+ """
301
+ values = info.data
302
+
303
+ if v is None and values.get('LLAMA_CPP_MODEL_FILE'):
304
+ cache_dir = values.get('MODEL_CACHE_DIR', Path("models"))
305
+ return cache_dir / values['LLAMA_CPP_MODEL_FILE']
306
+
307
+ return v
308
+
309
+
310
  def __init__(self, **kwargs):
311
  super().__init__(**kwargs)
312
+ # Ensure Directories Exist
313
  self.UPLOAD_DIR.mkdir(parents = True, exist_ok = True)
314
+ self.CACHE_DIR.mkdir(parents=True, exist_ok = True)
315
+ self.MODEL_CACHE_DIR.mkdir(parents = True, exist_ok = True)
316
 
317
  if self.LOG_FILE:
318
  self.LOG_FILE.parent.mkdir(parents = True, exist_ok = True)
319
+
320
 
321
 
322
  # Global settings instance
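The validators above key everything off the `SPACE_ID` variable that Hugging Face injects into Space containers. A small sketch of how the auto-detection behaves; the `SPACE_ID` value is a placeholder, and note that `__init__` also creates `MODEL_CACHE_DIR`, which needs write access to `/data` in this scenario:

```python
# Sketch: exercise the environment auto-detection added in this commit.
import os

os.environ["SPACE_ID"] = "user/contract-risk-analyzer"  # placeholder; set by HF Spaces at runtime

from config.settings import Settings

s = Settings()
print(s.IS_HUGGINGFACE_SPACE)  # True, because SPACE_ID is present
print(s.DEPLOYMENT_ENV)        # "huggingface"
print(s.ENABLE_OLLAMA)         # False (disabled on Spaces)
print(s.ENABLE_LLAMA_CPP)      # True (force-enabled on Spaces)
print(s.MODEL_CACHE_DIR)       # PosixPath('/data/models')
```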
model_manager/llm_manager.py CHANGED
@@ -3,6 +3,7 @@ import sys
3
  import json
4
  import time
5
  import requests
 
6
  from enum import Enum
7
  from typing import Any
8
  from typing import Dict
@@ -37,14 +38,24 @@ try:
37
  except ImportError:
38
  ANTHROPIC_AVAILABLE = False
39
 
 
 
 
 
 
 
40
 
 
 
41
  class LLMProvider(Enum):
42
  """
43
  Supported LLM providers
44
  """
45
- OLLAMA = "ollama"
46
- OPENAI = "openai"
47
- ANTHROPIC = "anthropic"
 
 
48
 
49
 
50
  @dataclass
@@ -78,16 +89,16 @@ class LLMResponse:
78
 
79
  class LLMManager:
80
  """
81
- Unified LLM manager for multiple providers : handles Ollama (local), OpenAI API, and Anthropic API
82
  """
83
- def __init__(self, default_provider: LLMProvider = LLMProvider.OLLAMA, ollama_base_url: Optional[str] = None,
84
  openai_api_key: Optional[str] = None, anthropic_api_key: Optional[str] = None):
85
  """
86
  Initialize LLM Manager
87
 
88
  Arguments:
89
  ----------
90
- default_provider : Default LLM provider to use
91
 
92
  ollama_base_url : Ollama server URL (default: from settings)
93
 
@@ -95,7 +106,7 @@ class LLMManager:
95
 
96
  anthropic_api_key : Anthropic API key (or set ANTHROPIC_API_KEY env var)
97
  """
98
- self.default_provider = default_provider
99
  self.logger = ContractAnalyzerLogger.get_logger()
100
 
101
  # Configuration Variables Initialization
@@ -122,24 +133,44 @@ class LLMManager:
122
  else:
123
  self.anthropic_client = None
124
 
 
 
 
 
125
  # Rate limiting (simple token bucket)
126
  self._rate_limit_tokens = settings.RATE_LIMIT_REQUESTS
127
  self._rate_limit_last_refill = time.time()
128
  self._rate_limit_refill_rate = settings.RATE_LIMIT_REQUESTS / settings.RATE_LIMIT_PERIOD
129
 
130
- # Generation settings
131
- self.generation_config = self.config.LLM_GENERATION
 
 
 
 
132
 
133
  log_info("LLMManager initialized",
134
- default_provider = default_provider.value,
135
- ollama_base_url = self.ollama_base_url,
136
- ollama_model = self.ollama_model,
137
- ollama_timeout = self.ollama_timeout,
138
- ollama_temperature = self.ollama_temperature,
139
- openai_available = OPENAI_AVAILABLE and bool(self.openai_api_key),
140
- anthropic_available = ANTHROPIC_AVAILABLE and bool(self.anthropic_api_key),
141
- rate_limit_requests = settings.RATE_LIMIT_REQUESTS,
142
- rate_limit_period = settings.RATE_LIMIT_PERIOD,
143
  )
144
 
145
 
@@ -148,6 +179,9 @@ class LLMManager:
148
  """
149
  Check if Ollama server is available
150
  """
 
 
 
151
  try:
152
  response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
153
  available = (response.status_code == 200)
@@ -165,20 +199,35 @@ class LLMManager:
165
 
166
  def get_available_providers(self) -> List[LLMProvider]:
167
  """
168
- Get list of available providers
169
  """
170
  available = list()
171
 
172
- if self._check_ollama_available():
 
173
  available.append(LLMProvider.OLLAMA)
174
 
175
- if OPENAI_AVAILABLE and self.openai_api_key:
176
  available.append(LLMProvider.OPENAI)
177
 
178
- if ANTHROPIC_AVAILABLE and self.anthropic_api_key:
179
  available.append(LLMProvider.ANTHROPIC)
180
 
181
- log_info("Available LLM providers", providers = [p.value for p in available])
 
 
 
 
182
 
183
  return available
184
 
@@ -188,8 +237,11 @@ class LLMManager:
188
  """
189
  Check if rate limit allows request (simple token bucket)
190
  """
191
- now = time.time()
192
- time_passed = now - self._rate_limit_last_refill
 
 
 
193
 
194
  # Refill tokens
195
  self._rate_limit_tokens = min(settings.RATE_LIMIT_REQUESTS, self._rate_limit_tokens + time_passed * self._rate_limit_refill_rate)
@@ -197,7 +249,6 @@ class LLMManager:
197
 
198
  if (self._rate_limit_tokens >= 1):
199
  self._rate_limit_tokens -= 1
200
-
201
  return True
202
 
203
  log_info("Rate limit hit, waiting...", tokens_remaining = self._rate_limit_tokens)
@@ -216,10 +267,10 @@ class LLMManager:
216
  # UNIFIED COMPLETION METHOD
217
  @ContractAnalyzerLogger.log_execution_time("llm_complete")
218
  def complete(self, prompt: str, provider: Optional[LLMProvider] = None, model: Optional[str] = None, temperature: Optional[float] = None,
219
- max_tokens: Optional[int] = None, system_prompt: Optional[str] = None, json_mode: bool = False, retry_on_error: bool = True,
220
- fallback_providers: Optional[List[LLMProvider]] = None) -> LLMResponse:
221
  """
222
- Unified completion method for all providers
223
 
224
  Arguments:
225
  ----------
@@ -229,9 +280,9 @@ class LLMManager:
229
 
230
  model : Model name (provider-specific)
231
 
232
- temperature : Sampling temperature (0.0-1.0, default from settings/config)
233
 
234
- max_tokens : Maximum tokens to generate (default from config)
235
 
236
  system_prompt : System prompt (if supported)
237
 
@@ -239,15 +290,16 @@ class LLMManager:
239
 
240
  retry_on_error : Retry with fallback providers on error
241
 
242
- fallback_providers : List of fallback providers to try
243
 
244
  Returns:
245
  --------
246
  { LLMResponse } : LLMResponse object
247
  """
248
- provider = provider or self.default_provider
249
- temperature = temperature or self.ollama_temperature
250
- max_tokens = max_tokens or self.generation_config["max_tokens"]
 
251
 
252
  log_info("LLM completion request",
253
  provider = provider.value,
@@ -260,74 +312,116 @@ class LLMManager:
260
  # Rate limiting
261
  self._wait_for_rate_limit()
262
 
263
- # Try primary provider
264
- try:
265
- if (provider == LLMProvider.OLLAMA):
266
- return self._complete_ollama(prompt = prompt,
267
- model = model,
268
- temperature = temperature,
269
- max_tokens = max_tokens,
270
- system_prompt = system_prompt,
271
- json_mode = json_mode,
272
- )
273
-
274
- elif (provider == LLMProvider.OPENAI):
275
- return self._complete_openai(prompt = prompt,
276
- model = model,
277
- temperature = temperature,
278
- max_tokens = max_tokens,
279
- system_prompt = system_prompt,
280
- json_mode = json_mode,
281
- )
282
-
283
- elif (provider == LLMProvider.ANTHROPIC):
284
- return self._complete_anthropic(prompt = prompt,
285
- model = model,
286
- temperature = temperature,
287
- max_tokens = max_tokens,
288
- system_prompt = system_prompt,
289
- )
290
-
291
- else:
292
- raise ValueError(f"Unsupported provider: {provider}")
293
-
294
- except Exception as e:
295
- log_error(e, context = {"component" : "LLMManager", "operation" : "complete", "provider" : provider.value})
 
 
 
 
296
 
297
- # Try fallback providers
298
- if (retry_on_error and fallback_providers):
299
- log_info("Trying fallback providers", fallbacks = [p.value for p in fallback_providers])
 
 
 
 
300
 
301
- for fallback_provider in fallback_providers:
302
- if (fallback_provider == provider):
303
- continue
 
 
 
 
 
 
 
 
304
 
305
- try:
306
- log_info(f"Attempting fallback to {fallback_provider.value}")
307
- # Prevent infinite recursion
308
- return self.complete(prompt = prompt,
309
- provider = fallback_provider,
310
- model = model,
311
- temperature = temperature,
312
- max_tokens = max_tokens,
313
- system_prompt = system_prompt,
314
- json_mode = json_mode,
315
- retry_on_error = False,
316
- )
317
-
318
- except Exception as fallback_error:
319
- log_error(fallback_error, context = {"component" : "LLMManager", "operation" : "fallback_complete", "provider" : fallback_provider.value})
320
- continue
321
-
322
- # All attempts failed
323
- return LLMResponse(text = "",
324
- provider = provider.value,
325
- model = model or "unknown",
326
- tokens_used = 0,
327
- latency_seconds = 0.0,
328
- success = False,
329
- error_message = str(e),
330
- )
 
 
 
 
331
 
332
 
333
  # OLLAMA Provider
@@ -335,6 +429,9 @@ class LLMManager:
335
  """
336
  Complete using local Ollama
337
  """
 
 
 
338
  start_time = time.time()
339
  model = model or self.ollama_model
340
 
@@ -359,7 +456,11 @@ class LLMManager:
359
  json_mode = json_mode,
360
  )
361
 
362
- response = requests.post(f"{self.ollama_base_url}/api/generate", json = payload, timeout = self.ollama_timeout)
 
 
 
 
363
  response.raise_for_status()
364
 
365
  result = response.json()
@@ -391,11 +492,14 @@ class LLMManager:
391
  """
392
  Complete using OpenAI API
393
  """
 
 
 
394
  if not OPENAI_AVAILABLE or not self.openai_api_key:
395
  raise ValueError("OpenAI not available. Install with: pip install openai")
396
 
397
  start_time = time.time()
398
- model = model or "gpt-3.5-turbo"
399
 
400
  # Construct messages
401
  messages = list()
@@ -443,11 +547,14 @@ class LLMManager:
443
  """
444
  Complete using Anthropic (Claude) API
445
  """
 
 
 
446
  if not ANTHROPIC_AVAILABLE or not self.anthropic_client:
447
  raise ValueError("Anthropic not available. Install with: pip install anthropic")
448
 
449
- start_time = time.time()
450
- model = model or "claude-3-sonnet-20240229"
451
 
452
  log_info("Calling Anthropic API", model = model)
453
 
@@ -455,7 +562,7 @@ class LLMManager:
455
  message = self.anthropic_client.messages.create(model = model,
456
  max_tokens = max_tokens,
457
  temperature = temperature,
458
- system = system_prompt or "",
459
  messages = [{"role": "user", "content": prompt}],
460
  )
461
 
@@ -475,6 +582,186 @@ class LLMManager:
475
  )
476
 
477
 
 
 
 
 
478
  # Specialized Methods
479
  def generate_structured_json(self, prompt: str, schema_description: str, provider: Optional[LLMProvider] = None, **kwargs) -> Dict[str, Any]:
480
  """
@@ -526,98 +813,6 @@ class LLMManager:
526
  raise ValueError(f"Failed to parse JSON response: {e}")
527
 
528
 
529
- def batch_complete(self, prompts: List[str], provider: Optional[LLMProvider] = None, **kwargs) -> List[LLMResponse]:
530
- """
531
- Complete multiple prompts (sequential for now)
532
-
533
- Arguments:
534
- ----------
535
- prompts : List of prompts
536
-
537
- provider : LLM provider
538
-
539
- **kwargs : Additional arguments for complete()
540
-
541
- Returns:
542
- --------
543
- { list } : List of LLMResponse objects
544
- """
545
- log_info("Batch completion started", batch_size=len(prompts))
546
-
547
- responses = list()
548
-
549
- for i, prompt in enumerate(prompts):
550
- log_info(f"Processing prompt {i+1}/{len(prompts)}")
551
-
552
- response = self.complete(prompt = prompt,
553
- provider = provider,
554
- **kwargs,
555
- )
556
-
557
- responses.append(response)
558
-
559
- successful = sum(1 for r in responses if r.success)
560
-
561
- log_info("Batch completion finished",
562
- total = len(prompts),
563
- successful = successful,
564
- failed = len(prompts) - successful,
565
- )
566
-
567
- return responses
568
-
569
-
570
- # OLLAMA-Specific Methods
571
- def list_ollama_models(self) -> List[str]:
572
- """
573
- List available local Ollama models
574
- """
575
- try:
576
- response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
577
- response.raise_for_status()
578
-
579
- models = [model['name'] for model in response.json().get('models', [])]
580
-
581
- log_info("Ollama models listed", count = len(models), models = models)
582
-
583
- return models
584
-
585
- except Exception as e:
586
- log_error(e, context = {"component" : "LLMManager", "operation" : "list_ollama_models"})
587
- return []
588
-
589
-
590
- def pull_ollama_model(self, model_name: str) -> bool:
591
- """
592
- Pull/download an Ollama model
593
- """
594
- try:
595
- log_info(f"Pulling Ollama model: {model_name}")
596
-
597
- response = requests.post(f"{self.ollama_base_url}/api/pull",
598
- json = {"name": model_name},
599
- stream = True,
600
- timeout = 600, # 10 minutes for download
601
- )
602
-
603
- response.raise_for_status()
604
-
605
- # Stream response to track progress
606
- for line in response.iter_lines():
607
- if line:
608
- data = json.loads(line)
609
-
610
- if ('status' in data):
611
- log_info(f"Pull status: {data['status']}")
612
-
613
- log_info(f"Model pulled successfully: {model_name}")
614
- return True
615
-
616
- except Exception as e:
617
- log_error(e, context = {"component" : "LLMManager", "operation" : "pull_ollama_model", "model" : model_name})
618
- return False
619
-
620
-
621
  # Utility Methods
622
  def get_provider_info(self, provider: LLMProvider) -> Dict[str, Any]:
623
  """
@@ -629,29 +824,32 @@ class LLMManager:
629
  }
630
 
631
  if (provider == LLMProvider.OLLAMA):
632
- info["available"] = self._check_ollama_available()
633
 
634
  if info["available"]:
635
  info["models"] = self.list_ollama_models()
636
  info["base_url"] = self.ollama_base_url
637
 
638
  elif (provider == LLMProvider.OPENAI):
639
- info["available"] = OPENAI_AVAILABLE and bool(self.openai_api_key)
640
 
641
  if info["available"]:
642
- info["models"] = ["gpt-3.5-turbo",
643
- "gpt-4",
644
- "gpt-4-turbo-preview",
645
- ]
646
 
647
  elif (provider == LLMProvider.ANTHROPIC):
648
- info["available"] = ANTHROPIC_AVAILABLE and bool(self.anthropic_client)
649
 
650
  if info["available"]:
651
- info["models"] = ["claude-3-opus-20240229",
652
- "claude-3-sonnet-20240229",
653
- "claude-3-haiku-20240307",
654
- ]
 
 
 
 
 
 
655
 
656
  return info
657
 
@@ -674,26 +872,26 @@ class LLMManager:
674
  --------
675
  { float } : Estimated cost in USD
676
  """
677
- # Pricing per 1K tokens (as of 2025)
678
- pricing = {"openai" : {"gpt-3.5-turbo" : {"prompt": 0.0015, "completion": 0.002},
679
- "gpt-4" : {"prompt": 0.03, "completion": 0.06},
680
- "gpt-4-turbo-preview" : {"prompt": 0.01, "completion": 0.03},
681
- },
682
- "anthropic" : {"claude-3-opus-20240229" : {"prompt": 0.015, "completion": 0.075},
683
- "claude-3-sonnet-20240229" : {"prompt": 0.003, "completion": 0.015},
684
- "claude-3-haiku-20240307" : {"prompt": 0.00025, "completion": 0.00125},
685
- }
686
- }
687
 
688
- if (provider == LLMProvider.OLLAMA):
689
- # Local models are free
690
- return 0.0
 
 
 
 
 
 
 
691
 
692
  provider_pricing = pricing.get(provider.value, {}).get(model)
693
 
694
  if not provider_pricing:
695
  return 0.0
696
 
697
- cost = ((prompt_tokens / 1000) * provider_pricing["prompt"] + (completion_tokens / 1000) * provider_pricing["completion"])
698
 
699
- return round(cost, 6)
 
3
  import json
4
  import time
5
  import requests
6
+ import threading
7
  from enum import Enum
8
  from typing import Any
9
  from typing import Dict
 
38
  except ImportError:
39
  ANTHROPIC_AVAILABLE = False
40
 
41
+ try:
42
+ from llama_cpp import Llama
43
+ LLAMA_CPP_AVAILABLE = True
44
+
45
+ except ImportError:
46
+ LLAMA_CPP_AVAILABLE = False
47
 
48
+
49
+ # Enums and models
50
  class LLMProvider(Enum):
51
  """
52
  Supported LLM providers
53
  """
54
+ OLLAMA = "ollama"
55
+ OPENAI = "openai"
56
+ ANTHROPIC = "anthropic"
57
+ LLAMA_CPP = "llama_cpp"
58
+ HF_INFER = "hf_inference"
59
 
60
 
61
  @dataclass
 
89
 
90
  class LLMManager:
91
  """
92
+ Unified LLM manager for multiple providers : handles Ollama (local), OpenAI API, Anthropic API, and Llama.cpp
93
  """
94
+ def __init__(self, default_provider: Optional[LLMProvider] = None, ollama_base_url: Optional[str] = None,
95
  openai_api_key: Optional[str] = None, anthropic_api_key: Optional[str] = None):
96
  """
97
  Initialize LLM Manager
98
 
99
  Arguments:
100
  ----------
101
+ default_provider : Default LLM provider to use (if None, uses settings.LLM_DEFAULT_PROVIDER)
102
 
103
  ollama_base_url : Ollama server URL (default: from settings)
104
 
 
106
 
107
  anthropic_api_key : Anthropic API key (or set ANTHROPIC_API_KEY env var)
108
  """
109
+ self.default_provider = default_provider or LLMProvider(settings.LLM_DEFAULT_PROVIDER)
110
  self.logger = ContractAnalyzerLogger.get_logger()
111
 
112
  # Configuration Variables Initialization
 
133
  else:
134
  self.anthropic_client = None
135
 
136
+ # Llama.cpp configuration (lazy loaded)
137
+ self.llama_cpp_model = None
138
+ self.llama_cpp_lock = threading.Lock()
139
+
140
+ # HuggingFace Inference configuration
141
+ self.hf_client = None
142
+
143
+ if (settings.ENABLE_HF_INFERENCE and settings.HF_API_TOKEN):
144
+ try:
145
+ from huggingface_hub import InferenceClient
146
+
147
+ self.hf_client = InferenceClient(model = settings.HF_MODEL_ID,
148
+ token = settings.HF_API_TOKEN,
149
+ )
150
+ except ImportError:
151
+ log_error("huggingface_hub not installed, HF Inference disabled")
152
+
153
  # Rate limiting (simple token bucket)
154
  self._rate_limit_tokens = settings.RATE_LIMIT_REQUESTS
155
  self._rate_limit_last_refill = time.time()
156
  self._rate_limit_refill_rate = settings.RATE_LIMIT_REQUESTS / settings.RATE_LIMIT_PERIOD
157
 
158
+ # Generation settings from settings (not ModelConfig)
159
+ self.generation_config = {"max_tokens" : settings.LLM_MAX_TOKENS,
160
+ "temperature" : settings.LLM_TEMPERATURE,
161
+ "top_p" : settings.LLM_TOP_P,
162
+ "repeat_penalty" : settings.LLM_REPEAT_PENALTY,
163
+ }
164
 
165
  log_info("LLMManager initialized",
166
+ default_provider = self.default_provider.value,
167
+ deployment_env = settings.DEPLOYMENT_ENV,
168
+ ollama_enabled = settings.ENABLE_OLLAMA,
169
+ llama_cpp_enabled = settings.ENABLE_LLAMA_CPP,
170
+ openai_available = OPENAI_AVAILABLE and bool(self.openai_api_key),
171
+ anthropic_available = ANTHROPIC_AVAILABLE and bool(self.anthropic_api_key),
172
+ llama_cpp_available = LLAMA_CPP_AVAILABLE,
173
+ provider_priority = settings.LLM_PROVIDER_PRIORITY,
 
174
  )
175
 
176
 
 
179
  """
180
  Check if Ollama server is available
181
  """
182
+ if not settings.ENABLE_OLLAMA:
183
+ return False
184
+
185
  try:
186
  response = requests.get(f"{self.ollama_base_url}/api/tags", timeout = 30)
187
  available = (response.status_code == 200)
 
199
 
200
  def get_available_providers(self) -> List[LLMProvider]:
201
  """
202
+ Get list of available providers based on settings and environment
203
  """
204
  available = list()
205
 
206
+ # Check each provider based on settings
207
+ if (settings.ENABLE_OLLAMA and self._check_ollama_available()):
208
  available.append(LLMProvider.OLLAMA)
209
 
210
+ if (settings.ENABLE_OPENAI and OPENAI_AVAILABLE and self.openai_api_key):
211
  available.append(LLMProvider.OPENAI)
212
 
213
+ if (settings.ENABLE_ANTHROPIC and ANTHROPIC_AVAILABLE and self.anthropic_api_key):
214
  available.append(LLMProvider.ANTHROPIC)
215
 
216
+ if (settings.ENABLE_LLAMA_CPP and LLAMA_CPP_AVAILABLE):
217
+ available.append(LLMProvider.LLAMA_CPP)
218
+
219
+ if (settings.ENABLE_HF_INFERENCE and self.hf_client):
220
+ available.append(LLMProvider.HF_INFER)
221
+
222
+ # Sort by priority from settings
223
+ priority_order = settings.LLM_PROVIDER_PRIORITY
224
+
225
+ available.sort(key = lambda p: priority_order.index(p.value) if p.value in priority_order else len(priority_order))
226
+
227
+ log_info("Available LLM providers",
228
+ providers = [p.value for p in available],
229
+ priority = priority_order,
230
+ )
231
 
232
  return available
233
 
 
237
  """
238
  Check if rate limit allows request (simple token bucket)
239
  """
240
+ if not settings.RATE_LIMIT_ENABLED:
241
+ return True
242
+
243
+ now = time.time()
244
+ time_passed = now - self._rate_limit_last_refill
245
 
246
  # Refill tokens
247
  self._rate_limit_tokens = min(settings.RATE_LIMIT_REQUESTS, self._rate_limit_tokens + time_passed * self._rate_limit_refill_rate)
 
249
 
250
  if (self._rate_limit_tokens >= 1):
251
  self._rate_limit_tokens -= 1
 
252
  return True
253
 
254
  log_info("Rate limit hit, waiting...", tokens_remaining = self._rate_limit_tokens)
 
267
  # UNIFIED COMPLETION METHOD
268
  @ContractAnalyzerLogger.log_execution_time("llm_complete")
269
  def complete(self, prompt: str, provider: Optional[LLMProvider] = None, model: Optional[str] = None, temperature: Optional[float] = None,
270
+ max_tokens: Optional[int] = None, system_prompt: Optional[str] = None, json_mode: bool = False, retry_on_error: bool = True,
271
+ max_retries: int = 3) -> LLMResponse:
272
  """
273
+ Unified completion method for all providers with automatic fallback
274
 
275
  Arguments:
276
  ----------
 
280
 
281
  model : Model name (provider-specific)
282
 
283
+ temperature : Sampling temperature (0.0-1.0, default from settings)
284
 
285
+ max_tokens : Maximum tokens to generate (default from settings)
286
 
287
  system_prompt : System prompt (if supported)
288
 
 
290
 
291
  retry_on_error : Retry with fallback providers on error
292
 
293
+ max_retries : Maximum number of retry attempts
294
 
295
  Returns:
296
  --------
297
  { LLMResponse } : LLMResponse object
298
  """
299
+ provider = provider or self.default_provider
300
+ temperature = temperature or settings.LLM_TEMPERATURE
301
+ max_tokens = max_tokens or settings.LLM_MAX_TOKENS
302
+ system_prompt = system_prompt or settings.LLM_SYSTEM_PROMPT
303
 
304
  log_info("LLM completion request",
305
  provider = provider.value,
 
312
  # Rate limiting
313
  self._wait_for_rate_limit()
314
 
315
+ # Try primary provider with retries
316
+ for attempt in range(max_retries if retry_on_error else 1):
317
+ try:
318
+ if (provider == LLMProvider.OLLAMA):
319
+ return self._complete_ollama(prompt = prompt,
320
+ model = model,
321
+ temperature = temperature,
322
+ max_tokens = max_tokens,
323
+ system_prompt = system_prompt,
324
+ json_mode = json_mode,
325
+ )
326
+
327
+ elif (provider == LLMProvider.OPENAI):
328
+ return self._complete_openai(prompt = prompt,
329
+ model = model,
330
+ temperature = temperature,
331
+ max_tokens = max_tokens,
332
+ system_prompt = system_prompt,
333
+ json_mode = json_mode,
334
+ )
335
+
336
+ elif (provider == LLMProvider.ANTHROPIC):
337
+ return self._complete_anthropic(prompt = prompt,
338
+ model = model,
339
+ temperature = temperature,
340
+ max_tokens = max_tokens,
341
+ system_prompt = system_prompt,
342
+ )
343
+
344
+ elif (provider == LLMProvider.LLAMA_CPP):
345
+ return self._complete_llama_cpp(prompt = prompt,
346
+ model = model,
347
+ temperature = temperature,
348
+ max_tokens = max_tokens,
349
+ system_prompt = system_prompt,
350
+ json_mode = json_mode,
351
+ )
352
+
353
+ elif (provider == LLMProvider.HF_INFER):
354
+ return self._complete_hf_inference(prompt = prompt,
355
+ model = model,
356
+ temperature = temperature,
357
+ max_tokens = max_tokens,
358
+ system_prompt = system_prompt,
359
+ )
360
+
361
+ else:
362
+ raise ValueError(f"Unsupported provider: {provider}")
363
 
364
+ except Exception as e:
365
+ log_error(e, context = {"component" : "LLMManager",
366
+ "operation" : "complete",
367
+ "provider" : provider.value,
368
+ "attempt" : attempt + 1,
369
+ }
370
+ )
371
 
372
+ if (attempt < max_retries - 1):
373
+ log_info(f"Retrying attempt {attempt + 2}/{max_retries}")
374
+ # Exponential backoff
375
+ time.sleep(1 * (attempt + 1))
376
+ continue
377
+
378
+ # If retries exhausted, try fallback providers
379
+ if retry_on_error:
380
+ available_providers = self.get_available_providers()
381
+ # Remove current provider from fallback list
382
+ fallback_providers = [p for p in available_providers if p != provider]
383
 
384
+ for fallback_provider in fallback_providers:
385
+ try:
386
+ log_info(f"Attempting fallback to {fallback_provider.value}")
387
+ # Prevent infinite recursion by disabling further fallbacks
388
+ return self.complete(prompt = prompt,
389
+ provider = fallback_provider,
390
+ model = model,
391
+ temperature = temperature,
392
+ max_tokens = max_tokens,
393
+ system_prompt = system_prompt,
394
+ json_mode = json_mode,
395
+ retry_on_error = False, # No more fallbacks
396
+ )
397
+
398
+ except Exception as fallback_error:
399
+ log_error(fallback_error, context = {"component" : "LLMManager",
400
+ "operation" : "fallback_complete",
401
+ "provider" : fallback_provider.value,
402
+ }
403
+ )
404
+ continue
405
+
406
+ # All attempts failed
407
+ return LLMResponse(text = "",
408
+ provider = provider.value,
409
+ model = model or "unknown",
410
+ tokens_used = 0,
411
+ latency_seconds = 0.0,
412
+ success = False,
413
+ error_message = str(e),
414
+ )
415
+
416
+ # Should never reach here
417
+ return LLMResponse(text = "",
418
+ provider = provider.value,
419
+ model = model or "unknown",
420
+ tokens_used = 0,
421
+ latency_seconds = 0.0,
422
+ success = False,
423
+ error_message = "Unknown error",
424
+ )
425
 
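For reference, a minimal caller-side sketch of the retry-and-fallback flow above. The import path services.llm_manager is an assumption (this commit does not show the manager's module path); only the class and enum names are confirmed by the diff.

# Hypothetical usage sketch; import path assumed, not shown in this commit
from services.llm_manager import LLMManager, LLMProvider

manager = LLMManager()   # default provider comes from the manager's configuration

response = manager.complete(prompt         = "List the termination triggers in this clause.",
                            provider       = LLMProvider.LLAMA_CPP,
                            temperature    = 0.1,
                            max_tokens     = 256,
                            retry_on_error = True,   # retry, then walk the other available providers
                            )

if response.success:
    print(response.text)
else:
    print(f"All providers failed: {response.error_message}")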
426
 
427
  # OLLAMA Provider
 
429
  """
430
  Complete using local Ollama
431
  """
432
+ if not settings.ENABLE_OLLAMA:
433
+ raise ValueError("Ollama is disabled in settings")
434
+
435
  start_time = time.time()
436
  model = model or self.ollama_model
437
 
 
456
  json_mode = json_mode,
457
  )
458
 
459
+ response = requests.post(f"{self.ollama_base_url}/api/generate",
460
+ json = payload,
461
+ timeout = self.ollama_timeout,
462
+ )
463
+
464
  response.raise_for_status()
465
 
466
  result = response.json()
 
492
  """
493
  Complete using OpenAI API
494
  """
495
+ if not settings.ENABLE_OPENAI:
496
+ raise ValueError("OpenAI is disabled in settings")
497
+
498
  if not OPENAI_AVAILABLE or not self.openai_api_key:
499
  raise ValueError("OpenAI not available. Install with: pip install openai")
500
 
501
  start_time = time.time()
502
+ model = model or settings.OPENAI_MODEL
503
 
504
  # Construct messages
505
  messages = list()
 
547
  """
548
  Complete using Anthropic (Claude) API
549
  """
550
+ if not settings.ENABLE_ANTHROPIC:
551
+ raise ValueError("Anthropic is disabled in settings")
552
+
553
  if not ANTHROPIC_AVAILABLE or not self.anthropic_client:
554
  raise ValueError("Anthropic not available. Install with: pip install anthropic")
555
 
556
+ start_time = time.time()
557
+ model = model or settings.ANTHROPIC_MODEL
558
 
559
  log_info("Calling Anthropic API", model = model)
560
 
 
562
  message = self.anthropic_client.messages.create(model = model,
563
  max_tokens = max_tokens,
564
  temperature = temperature,
565
+ system = system_prompt or settings.LLM_SYSTEM_PROMPT,
566
  messages = [{"role": "user", "content": prompt}],
567
  )
568
 
 
582
  )
583
 
584
 
585
+ # Llama.cpp Provider
586
+ def _complete_llama_cpp(self, prompt: str, model: Optional[str], temperature: float, max_tokens: int, system_prompt: Optional[str], json_mode: bool) -> LLMResponse:
587
+ """
588
+ Complete using Llama.cpp (GGUF models)
589
+ """
590
+ if not settings.ENABLE_LLAMA_CPP:
591
+ raise ValueError("Llama.cpp is disabled in settings")
592
+
593
+ if not LLAMA_CPP_AVAILABLE:
594
+ raise ValueError("llama-cpp-python not installed. Install with: pip install llama-cpp-python")
595
+
596
+ start_time = time.time()
597
+
598
+ # Lazy load the model
599
+ with self.llama_cpp_lock:
600
+ if self.llama_cpp_model is None:
601
+ self._load_llama_cpp_model()
602
+
603
+ # Construct full prompt
604
+ system_prompt = system_prompt or settings.LLM_SYSTEM_PROMPT
605
+
606
+ full_prompt = f"""
607
+ {system_prompt}
608
+
609
+ {prompt}
610
+
611
+ Response:
612
+ """
613
+
614
+ log_info("Calling Llama.cpp",
615
+ model_path = str(settings.LLAMA_CPP_MODEL_PATH),
616
+ n_ctx = settings.LLAMA_CPP_N_CTX,
617
+ json_mode = json_mode,
618
+ )
619
+
620
+ # Generate response
621
+ response = self.llama_cpp_model(prompt = full_prompt,
622
+ max_tokens = max_tokens,
623
+ temperature = temperature,
624
+ top_p = settings.LLM_TOP_P,
625
+ repeat_penalty = settings.LLM_REPEAT_PENALTY,
626
+ stop = ["\n\n", "###", "Human:", "Assistant:", "</s>"],
627
+ echo = False,
628
+ )
629
+
630
+ generated_text = response['choices'][0]['text'].strip()
631
+ latency = time.time() - start_time
632
+
633
+ # Rough token estimation
634
+ tokens_used = len(full_prompt.split()) + len(generated_text.split())
635
+
636
+ log_info("Llama.cpp completion successful",
637
+ tokens_used = tokens_used,
638
+ latency_seconds = round(latency, 3),
639
+ )
640
+
641
+ return LLMResponse(text = generated_text,
642
+ provider = "llama_cpp",
643
+ model = str(settings.LLAMA_CPP_MODEL_PATH),
644
+ tokens_used = tokens_used,
645
+ latency_seconds = latency,
646
+ success = True,
647
+ raw_response = response,
648
+ )
649
+
650
+
651
+ def _load_llama_cpp_model(self):
652
+ """
653
+ Lazy load the Llama.cpp model
654
+ """
655
+ log_info("Loading Llama.cpp model", model_path=str(settings.LLAMA_CPP_MODEL_PATH))
656
+
657
+ # Ensure model exists, download if needed
658
+ if (not settings.LLAMA_CPP_MODEL_PATH.exists()):
659
+ self._download_llama_cpp_model()
660
+
661
+ # Select GPU offload layers (forced to CPU-only on HF Spaces)
662
+ n_gpu_layers = settings.LLAMA_CPP_N_GPU_LAYERS
663
+
664
+ if settings.IS_HUGGINGFACE_SPACE:
665
+ n_gpu_layers = 0
666
+
667
+ self.llama_cpp_model = Llama(model_path = str(settings.LLAMA_CPP_MODEL_PATH),
668
+ n_ctx = settings.LLAMA_CPP_N_CTX,
669
+ n_gpu_layers = n_gpu_layers,
670
+ n_batch = settings.LLAMA_CPP_N_BATCH,
671
+ n_threads = settings.LLAMA_CPP_N_THREADS,
672
+ verbose = False,
673
+ )
674
+
675
+ log_info("Llama.cpp model loaded successfully")
676
+
677
+
678
+ def _download_llama_cpp_model(self):
679
+ """
680
+ Download GGUF model from HuggingFace Hub
681
+ """
682
+ log_info("Downloading GGUF model", repo = settings.LLAMA_CPP_MODEL_REPO, filename = settings.LLAMA_CPP_MODEL_FILE)
683
+
684
+ try:
685
+ from huggingface_hub import hf_hub_download
686
+
687
+ # Ensure cache directory exists
688
+ settings.MODEL_CACHE_DIR.mkdir(parents = True, exist_ok = True)
689
+
690
+ # Download the model
691
+ downloaded_path = hf_hub_download(repo_id = settings.LLAMA_CPP_MODEL_REPO,
692
+ filename = settings.LLAMA_CPP_MODEL_FILE,
693
+ cache_dir = str(settings.MODEL_CACHE_DIR),
694
+ force_download = False,
695
+ resume_download = True,
696
+ )
697
+
698
+ # Copy the downloaded file to the expected model path
699
+ if (downloaded_path != str(settings.LLAMA_CPP_MODEL_PATH)):
700
+ import shutil
701
+ shutil.copy(downloaded_path, settings.LLAMA_CPP_MODEL_PATH)
702
+
703
+ log_info("GGUF model downloaded successfully", path = str(settings.LLAMA_CPP_MODEL_PATH))
704
+
705
+ except Exception as e:
706
+ log_error(e, context = {"component" : "LLMManager",
707
+ "operation" : "download_llama_cpp_model",
708
+ "repo" : settings.LLAMA_CPP_MODEL_REPO,
709
+ "filename" : settings.LLAMA_CPP_MODEL_FILE,
710
+ }
711
+ )
712
+ raise
713
+
714
+
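The same download path can be exercised ahead of time (for example in a Space start-up script) to warm the model cache before the first request. A small sketch reusing the settings referenced above; it simply mirrors _download_llama_cpp_model() outside the manager.

# One-off cache warm-up sketch, mirroring _download_llama_cpp_model() above
from huggingface_hub import hf_hub_download
from config.settings import settings

settings.MODEL_CACHE_DIR.mkdir(parents = True, exist_ok = True)

gguf_path = hf_hub_download(repo_id   = settings.LLAMA_CPP_MODEL_REPO,
                            filename  = settings.LLAMA_CPP_MODEL_FILE,
                            cache_dir = str(settings.MODEL_CACHE_DIR),
                            )
print(f"GGUF model cached at: {gguf_path}")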
715
+ # HuggingFace Inference Provider
716
+ def _complete_hf_inference(self, prompt: str, model: Optional[str], temperature: float, max_tokens: int, system_prompt: Optional[str]) -> LLMResponse:
717
+ """
718
+ Complete using HuggingFace Inference API
719
+ """
720
+ if not settings.ENABLE_HF_INFERENCE or not self.hf_client:
721
+ raise ValueError("HF Inference is disabled or not configured")
722
+
723
+ start_time = time.time()
724
+
725
+ # Construct full prompt
726
+ full_prompt = f"""
727
+ {system_prompt or settings.LLM_SYSTEM_PROMPT}
728
+
729
+ {prompt}
730
+
731
+ Response:
732
+ """
733
+
734
+ log_info("Calling HuggingFace Inference API")
735
+
736
+ # Generate response
737
+ response = self.hf_client.text_generation(full_prompt,
738
+ max_new_tokens = max_tokens,
739
+ temperature = temperature,
740
+ do_sample = True,
741
+ return_full_text = False,
742
+ )
743
+
744
+ generated_text = response
745
+ latency = time.time() - start_time
746
+
747
+ # Rough token estimation
748
+ tokens_used = len(full_prompt.split()) + len(generated_text.split())
749
+
750
+ log_info("HF Inference completion successful",
751
+ tokens_used = tokens_used,
752
+ latency_seconds = round(latency, 3),
753
+ )
754
+
755
+ return LLMResponse(text = generated_text,
756
+ provider = "hf_inference",
757
+ model = settings.HF_MODEL_ID or "hf_inference",
758
+ tokens_used = tokens_used,
759
+ latency_seconds = latency,
760
+ success = True,
761
+ raw_response = {"text": generated_text},
762
+ )
763
+
764
+
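The hf_client used here is constructed elsewhere in the manager and is not shown in this diff. A plausible sketch of that construction, assuming huggingface_hub.InferenceClient; the token attribute name is an assumption.

# Sketch only: how self.hf_client might be built (InferenceClient is part of huggingface_hub)
from huggingface_hub import InferenceClient
from config.settings import settings

hf_client = InferenceClient(model = settings.HF_MODEL_ID,
                            token = getattr(settings, "HF_API_TOKEN", None),   # assumed setting name
                            )

text = hf_client.text_generation("Identify the governing-law clause in this contract.",
                                 max_new_tokens   = 128,
                                 temperature      = 0.1,
                                 do_sample        = True,
                                 return_full_text = False,
                                 )
print(text)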
765
  # Specialized Methods
766
  def generate_structured_json(self, prompt: str, schema_description: str, provider: Optional[LLMProvider] = None, **kwargs) -> Dict[str, Any]:
767
  """
 
813
  raise ValueError(f"Failed to parse JSON response: {e}")
814
 
815
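Only the signature and the JSON-parse failure path of generate_structured_json() are visible here. A hedged usage sketch, assuming the method delegates to complete() with JSON output requested and returns the parsed dictionary; the import path is assumed as before.

# Hypothetical call; schema_description is free-form text describing the expected JSON shape
from services.llm_manager import LLMManager, LLMProvider   # assumed module path

manager = LLMManager()

try:
    result = manager.generate_structured_json(prompt             = "Assess the limitation-of-liability clause.",
                                              schema_description = "Object with keys: risk_level (str), concerns (list of str)",
                                              provider           = LLMProvider.LLAMA_CPP,
                                              )
    print(result.get("risk_level"))

except ValueError as e:
    # Raised when the model output cannot be parsed as JSON (see above)
    print(f"Structured generation failed: {e}")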
816
  # Utility Methods
817
  def get_provider_info(self, provider: LLMProvider) -> Dict[str, Any]:
818
  """
 
824
  }
825
 
826
  if (provider == LLMProvider.OLLAMA):
827
+ info["available"] = settings.ENABLE_OLLAMA and self._check_ollama_available()
828
 
829
  if info["available"]:
830
  info["models"] = self.list_ollama_models()
831
  info["base_url"] = self.ollama_base_url
832
 
833
  elif (provider == LLMProvider.OPENAI):
834
+ info["available"] = settings.ENABLE_OPENAI and OPENAI_AVAILABLE and bool(self.openai_api_key)
835
 
836
  if info["available"]:
837
+ info["models"] = [settings.OPENAI_MODEL, "gpt-4", "gpt-4-turbo-preview"]
838
 
839
  elif (provider == LLMProvider.ANTHROPIC):
840
+ info["available"] = settings.ENABLE_ANTHROPIC and ANTHROPIC_AVAILABLE and bool(self.anthropic_client)
841
 
842
  if info["available"]:
843
+ info["models"] = [settings.ANTHROPIC_MODEL, "claude-3-sonnet-20240229", "claude-3-opus-20240229"]
844
+
845
+ elif (provider == LLMProvider.LLAMA_CPP):
846
+ info["available"] = settings.ENABLE_LLAMA_CPP and LLAMA_CPP_AVAILABLE
847
+ info["model_path"] = str(settings.LLAMA_CPP_MODEL_PATH) if settings.LLAMA_CPP_MODEL_PATH else None
848
+ info["model_repo"] = settings.LLAMA_CPP_MODEL_REPO
849
+
850
+ elif (provider == LLMProvider.HF_INFER):
851
+ info["available"] = settings.ENABLE_HF_INFERENCE and self.hf_client is not None
852
+ info["model_id"] = settings.HF_MODEL_ID
853
 
854
  return info
855
 
 
872
  --------
873
  { float } : Estimated cost in USD
874
  """
875
+ # Local models (Ollama, Llama.cpp) and the HF Inference free tier incur no direct API cost
876
+ if provider in [LLMProvider.OLLAMA, LLMProvider.LLAMA_CPP, LLMProvider.HF_INFER]:
877
+ return 0.0
878
 
879
+ # Pricing per 1K tokens (approximate; verify against current provider price lists)
880
+ pricing = {"openai" : {"gpt-3.5-turbo" : {"prompt": 0.0015, "completion": 0.002},
881
+ "gpt-4" : {"prompt": 0.03, "completion": 0.06},
882
+ "gpt-4-turbo-preview" : {"prompt": 0.01, "completion": 0.03},
883
+ },
884
+ "anthropic" : {"claude-3-opus-20240229" : {"prompt": 0.015, "completion": 0.075},
885
+ "claude-3-sonnet-20240229" : {"prompt": 0.003, "completion": 0.015},
886
+ "claude-3-haiku-20240307" : {"prompt": 0.00025, "completion": 0.00125},
887
+ }
888
+ }
889
 
890
  provider_pricing = pricing.get(provider.value, {}).get(model)
891
 
892
  if not provider_pricing:
893
  return 0.0
894
 
895
+ cost = ((prompt_tokens / 1000) * provider_pricing["prompt"] + (completion_tokens / 1000) * provider_pricing["completion"])
896
 
897
+ return round(cost, 6)
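A worked example of the per-1K-token formula above, using the gpt-3.5-turbo rates from this table (the rates are the ones hard-coded here, not necessarily current):

# 1,200 prompt tokens + 400 completion tokens on gpt-3.5-turbo:
#   cost = (1200 / 1000) * 0.0015 + (400 / 1000) * 0.002
#        = 0.0018 + 0.0008
#        = 0.0026 USD
assert round((1200 / 1000) * 0.0015 + (400 / 1000) * 0.002, 6) == 0.0026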
requirements.txt CHANGED
@@ -37,6 +37,10 @@ requests>=2.31.0
37
  openai>=1.0.0
38
  anthropic>=0.5.0
39
 
 
 
 
 
40
  # Text Processing Utilities
41
  chardet>=5.0.0
42
  langdetect>=1.0.9
@@ -56,4 +60,7 @@ psutil>=5.9.5
56
  orjson>=3.9.0
57
 
58
  # For spaCy performance
59
- blis>=0.7.10
 
 
 
 
37
  openai>=1.0.0
38
  anthropic>=0.5.0
39
 
40
+ # For HuggingFace Spaces deployment
41
+ llama-cpp-python>=0.2.20 # For CPU-only GGUF models on HF Spaces
42
+ huggingface-hub>=0.19.0 # For downloading GGUF models
43
+
44
  # Text Processing Utilities
45
  chardet>=5.0.0
46
  langdetect>=1.0.9
 
60
  orjson>=3.9.0
61
 
62
  # For spaCy performance
63
+ blis>=0.7.10
64
+
65
+ # Additional dependency for the spaCy transformer model
66
+ spacy-transformers>=1.2.0
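A quick smoke test for the two new dependencies (a sketch; it only checks that the packages pinned above import cleanly in the Space runtime):

import llama_cpp
import huggingface_hub

print("llama-cpp-python:", llama_cpp.__version__)
print("huggingface-hub :", huggingface_hub.__version__)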
services/llm_interpreter.py CHANGED
@@ -36,6 +36,7 @@ class LLMClauseInterpreter:
36
  Arguments:
37
  ----------
38
  llm_manager { LLMManager } : LLMManager instance
 
39
  default_provider { LLMProvider } : Default LLM provider to use
40
  """
41
  self.llm_manager = llm_manager
@@ -225,7 +226,7 @@ class LLMClauseInterpreter:
225
  provider = provider,
226
  temperature = 0.3,
227
  max_tokens = 1200,
228
- fallback_providers = [LLMProvider.OPENAI, LLMProvider.ANTHROPIC],
229
  )
230
 
231
  # Calculate negotiation priority
 
36
  Arguments:
37
  ----------
38
  llm_manager { LLMManager } : LLMManager instance
39
+
40
  default_provider { LLMProvider } : Default LLM provider to use
41
  """
42
  self.llm_manager = llm_manager
 
226
  provider = provider,
227
  temperature = 0.3,
228
  max_tokens = 1200,
229
+ fallback_providers = [LLMProvider.LLAMA_CPP, LLMProvider.OPENAI, LLMProvider.ANTHROPIC],
230
  )
231
 
232
  # Calculate negotiation priority
services/negotiation_engine.py CHANGED
@@ -503,7 +503,7 @@ class NegotiationEngine:
503
  provider = provider,
504
  temperature = 0.3,
505
  max_tokens = 2000,
506
- fallback_providers = [LLMProvider.OPENAI],
507
  retry_on_error = True,
508
  )
509
  if response.success:
 
503
  provider = provider,
504
  temperature = 0.3,
505
  max_tokens = 2000,
506
+ fallback_providers = [LLMProvider.LLAMA_CPP, LLMProvider.OPENAI, LLMProvider.ANTHROPIC],
507
  retry_on_error = True,
508
  )
509
  if response.success:
services/summary_generator.py CHANGED
@@ -23,15 +23,23 @@ class SummaryGenerator:
23
  """
24
  LLM-powered executive summary generator for contract analysis : Generates professional, detailed executive summaries using ALL pipeline outputs
25
  """
26
- def __init__(self, llm_manager: Optional[LLMManager] = None):
27
  """
28
  Initialize the summary generator
29
 
30
  Arguments:
31
  ----------
32
- llm_manager { LLMManager } : LLM manager instance (if None, creates one with default settings)
 
 
33
  """
34
- self.llm_manager = llm_manager or LLMManager()
 
 
 
 
 
 
35
  self.logger = ContractAnalyzerLogger.get_logger()
36
 
37
  self.logger.info("Summary generator initialized")
@@ -39,7 +47,8 @@ class SummaryGenerator:
39
 
40
  # Main entry point with full pipeline integration
41
  def generate_executive_summary(self, contract_text: str, classification: ContractCategory, risk_analysis: RiskScore, risk_interpretation: RiskInterpretation,
42
- negotiation_playbook: NegotiationPlaybook, unfavorable_terms: List, missing_protections: List, clauses: List) -> str:
 
43
  """
44
  Generate executive summary using all the pipeline outputs
45
 
@@ -60,6 +69,8 @@ class SummaryGenerator:
60
  missing_protections { List } : Missing protections
61
 
62
  clauses { List } : Extracted clauses
 
 
63
 
64
  Returns:
65
  --------
@@ -78,7 +89,9 @@ class SummaryGenerator:
78
  )
79
 
80
  # Generate summary using LLM
81
- summary = self._generate_summary(context = context)
 
 
82
 
83
  self.logger.info(f"Executive summary generated - Risk: {context.risk_score}/100 ({context.risk_level})")
84
 
@@ -193,7 +206,7 @@ class SummaryGenerator:
193
  return findings
194
 
195
 
196
- def _generate_summary(self, context: SummaryContext) -> str:
197
  """
198
  Generate enhanced summary using comprehensive context
199
  """
@@ -203,6 +216,7 @@ class SummaryGenerator:
203
  try:
204
  response = self.llm_manager.complete(prompt = prompt,
205
  system_prompt = system_prompt,
 
206
  temperature = 0.3,
207
  max_tokens = 500,
208
  json_mode = False,
 
23
  """
24
  LLM-powered executive summary generator for contract analysis : Generates professional, detailed executive summaries using ALL pipeline outputs
25
  """
26
+ def __init__(self, llm_manager: Optional[LLMManager] = None, default_provider: Optional[LLMProvider] = None):
27
  """
28
  Initialize the summary generator
29
 
30
  Arguments:
31
  ----------
32
+ llm_manager { LLMManager } : LLM manager instance (if None, creates one with default settings)
33
+
34
+ default_provider { LLMProvider } : Default LLM provider to use when creating a new LLMManager
35
  """
36
+ # Create LLMManager with the specified provider (or use default from settings)
37
+ if llm_manager is None:
38
+ self.llm_manager = LLMManager(default_provider = default_provider)
39
+
40
+ else:
41
+ self.llm_manager = llm_manager
42
+
43
  self.logger = ContractAnalyzerLogger.get_logger()
44
 
45
  self.logger.info("Summary generator initialized")
 
47
 
48
  # Main entry point with full pipeline integration
49
  def generate_executive_summary(self, contract_text: str, classification: ContractCategory, risk_analysis: RiskScore, risk_interpretation: RiskInterpretation,
50
+ negotiation_playbook: NegotiationPlaybook, unfavorable_terms: List, missing_protections: List, clauses: List,
51
+ provider: Optional[LLMProvider] = None) -> str:
52
  """
53
  Generate executive summary using all the pipeline outputs
54
 
 
69
  missing_protections { List } : Missing protections
70
 
71
  clauses { List } : Extracted clauses
72
+
73
+ provider { LLMProvider } : Optional LLM provider override
74
 
75
  Returns:
76
  --------
 
89
  )
90
 
91
  # Generate summary using LLM
92
+ summary = self._generate_summary(context = context,
93
+ provider = provider,
94
+ )
95
 
96
  self.logger.info(f"Executive summary generated - Risk: {context.risk_score}/100 ({context.risk_level})")
97
 
 
206
  return findings
207
 
208
 
209
+ def _generate_summary(self, context: SummaryContext, provider: Optional[LLMProvider] = None) -> str:
210
  """
211
  Generate enhanced summary using comprehensive context
212
  """
 
216
  try:
217
  response = self.llm_manager.complete(prompt = prompt,
218
  system_prompt = system_prompt,
219
+ provider = provider,
220
  temperature = 0.3,
221
  max_tokens = 500,
222
  json_mode = False,
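Taken together, the new provider argument now flows from the public entry point down to the underlying complete() call. A minimal end-to-end sketch; the llm_manager import path and the shape of the pipeline object are assumptions, while services.summary_generator matches the file changed above.

# Sketch of the new provider plumbing; `pipeline` stands in for the outputs of the earlier analysis stages
from services.summary_generator import SummaryGenerator
from services.llm_manager import LLMProvider   # assumed module path

def summarize_with_llama_cpp(pipeline) -> str:
    generator = SummaryGenerator(default_provider = LLMProvider.LLAMA_CPP)

    return generator.generate_executive_summary(contract_text        = pipeline.contract_text,
                                                 classification       = pipeline.classification,
                                                 risk_analysis        = pipeline.risk_analysis,
                                                 risk_interpretation  = pipeline.risk_interpretation,
                                                 negotiation_playbook = pipeline.negotiation_playbook,
                                                 unfavorable_terms    = pipeline.unfavorable_terms,
                                                 missing_protections  = pipeline.missing_protections,
                                                 clauses              = pipeline.clauses,
                                                 provider             = LLMProvider.LLAMA_CPP,   # per-call override added in this commit
                                                 )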
utils/document_reader.py CHANGED
@@ -9,10 +9,6 @@ from typing import Union
9
  from pathlib import Path
10
  from docx import Document
11
  from typing import Optional
12
-
13
- # Add parent directory to path for imports
14
- #sys.path.append(str(Path(__file__).parent.parent))
15
-
16
  from config.settings import settings
17
 
18
 
 
9
  from pathlib import Path
10
  from docx import Document
11
  from typing import Optional
 
 
 
 
12
  from config.settings import settings
13
 
14