Commit 2473068 · Parent: b213f99

Fix: Handle None LLAMA_CPP_MODEL_PATH and pre-download model

Files changed:
- Dockerfile (+13 −1)
- config/settings.py (+4 −4)
- model_manager/llm_manager.py (+7 −2)
Dockerfile (CHANGED)

@@ -5,6 +5,7 @@ ENV PIP_NO_CACHE_DIR=1
 ENV DOCKER_CONTAINER=true
 ENV SPACE_APP_DATA=/data
 ENV HF_HOME=/data/huggingface
+ENV LLAMA_CPP_MODEL_PATH=/data/models/Hermes-2-Pro-Llama-3-8B-GGUF.Q4_K_M.gguf
 
 # Optimize llama-cpp-python build for CPU only
 ENV CMAKE_ARGS="-DLLAMA_BLAS=0 -DLLAMA_CUBLAS=0"
@@ -37,6 +38,17 @@ RUN python -m spacy download en_core_web_sm
 # Create directories that your app expects
 RUN mkdir -p /data/models /data/uploads /data/cache /data/logs /data/huggingface
 
+# Download GGUF model during build (BEFORE copying app code)
+RUN python -c "from huggingface_hub import hf_hub_download; \
+    import shutil; \
+    downloaded = hf_hub_download( \
+        repo_id='NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF', \
+        filename='Hermes-2-Pro-Llama-3-8B-GGUF.Q4_K_M.gguf', \
+        cache_dir='/data/huggingface' \
+    ); \
+    shutil.copy(downloaded, '/data/models/Hermes-2-Pro-Llama-3-8B-GGUF.Q4_K_M.gguf')" && \
+    echo "Model downloaded to /data/models/"
+
 # Copy app code
 COPY . .
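Functionally, that single RUN step amounts to the standalone script below (an illustrative sketch, not a file in this repo; the script name and the mkdir call are additions, while the repo id, filename, cache dir, and target path mirror the Dockerfile):

# download_model.py - illustrative equivalent of the RUN step above
from pathlib import Path
import shutil

from huggingface_hub import hf_hub_download

MODELS_DIR = Path("/data/models")
TARGET = MODELS_DIR / "Hermes-2-Pro-Llama-3-8B-GGUF.Q4_K_M.gguf"

def main() -> None:
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    # Download into the HF cache (HF_HOME=/data/huggingface), then copy the file
    # to the fixed path that LLAMA_CPP_MODEL_PATH points at.
    downloaded = hf_hub_download(
        repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
        filename="Hermes-2-Pro-Llama-3-8B-GGUF.Q4_K_M.gguf",
        cache_dir="/data/huggingface",
    )
    shutil.copy(downloaded, TARGET)
    print(f"Model downloaded to {TARGET}")

if __name__ == "__main__":
    main()

Doing this before COPY . . keeps the multi-gigabyte download in an early layer, so later edits to the app code do not invalidate the cached model layer.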
@@ -46,7 +58,7 @@ RUN chmod -R 755 /app && \
 
 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
-    CMD curl -f http://localhost:7860/
+    CMD curl -f http://localhost:7860/api/v1/health || exit 1
 
 EXPOSE 7860
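The health check now probes /api/v1/health instead of the root path, and the explicit || exit 1 makes a failed curl mark the container unhealthy. The commit does not show the server side of that route; a minimal handler could look like the following sketch, where FastAPI and the router prefix are assumptions:

# Illustrative only: the app's actual framework and router layout are not shown
# in this commit; FastAPI and the "/api/v1" prefix are assumptions.
from fastapi import APIRouter

router = APIRouter(prefix="/api/v1")

@router.get("/health")
def health() -> dict:
    # Return HTTP 200 with a small JSON body so `curl -f` succeeds.
    return {"status": "ok"}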
config/settings.py (CHANGED)

@@ -91,10 +91,10 @@ class Settings(BaseSettings):
     HF_API_TOKEN : Optional[str] = None # HF token for gated models
 
     # LLM Generation Settings (Shared across providers)
-    LLM_TEMPERATURE : float = 0.1
-    LLM_MAX_TOKENS : int = 1024
-    LLM_TOP_P : float = 0.95
-    LLM_REPEAT_PENALTY : float = 1.1
+    LLM_TEMPERATURE : float = 0.1 # Default for all providers
+    LLM_MAX_TOKENS : int = 1024 # Default for all providers
+    LLM_TOP_P : float = 0.95 # Default top-p sampling
+    LLM_REPEAT_PENALTY : float = 1.1 # Default repeat penalty
     LLM_SYSTEM_PROMPT : str = "You are a helpful legal assistant specializing in contract analysis and risk assessment."
 
     # Analysis Limits
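These fields live on the Settings(BaseSettings) class, so each one can be overridden by an environment variable of the same name; that is why the new ENV LLAMA_CPP_MODEL_PATH line in the Dockerfile keeps that setting from being None in the first place. A trimmed sketch of the relevant fields follows (illustrative; only field names that appear in this commit come from the source, the types, defaults, and the BaseSettings import location depend on the project's pydantic version):

# Sketch of the relevant Settings fields; defaults other than those shown in the
# diff are assumptions.
from pathlib import Path
from typing import Optional

from pydantic import BaseSettings  # in pydantic v2 this moves to pydantic_settings


class Settings(BaseSettings):
    # Populated from the LLAMA_CPP_MODEL_PATH env var set in the Dockerfile;
    # stays None if the variable is absent (the case the llm_manager fix handles).
    LLAMA_CPP_MODEL_PATH: Optional[Path] = None
    MODEL_CACHE_DIR: Path = Path("/data/models")  # assumed default
    LLAMA_CPP_MODEL_FILE: str = "Hermes-2-Pro-Llama-3-8B-GGUF.Q4_K_M.gguf"

    # LLM Generation Settings (Shared across providers)
    LLM_TEMPERATURE: float = 0.1
    LLM_MAX_TOKENS: int = 1024
    LLM_TOP_P: float = 0.95
    LLM_REPEAT_PENALTY: float = 1.1


settings = Settings()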
model_manager/llm_manager.py (CHANGED)

@@ -652,10 +652,15 @@ class LLMManager:
         """
         Lazy load the Llama.cpp model
         """
-
+        # Handle None model path
+        if settings.LLAMA_CPP_MODEL_PATH is None:
+            settings.LLAMA_CPP_MODEL_PATH = settings.MODEL_CACHE_DIR / settings.LLAMA_CPP_MODEL_FILE
+            log_info(f"Model path was None, set to: {settings.LLAMA_CPP_MODEL_PATH}")
+
+        log_info("Loading Llama.cpp model", model_path = str(settings.LLAMA_CPP_MODEL_PATH))
 
         # Ensure model exists, download if needed
-        if(
+        if (not settings.LLAMA_CPP_MODEL_PATH.exists()):
             self._download_llama_cpp_model()
 
         # Load model with appropriate GPU layers / CPU loading