Spaces:

ResearchEngineering
/

financial_news_bot

Sleeping

App Files Files Community

Dmitry Beresnev committed 7 days ago

Commit

5260ec0

1 Parent(s): 37c39e5

add cache for the downloaded data

Browse files

Files changed (1) hide show

src/core/ticker_scanner/parallel_data_downloader.py +128 -20

src/core/ticker_scanner/parallel_data_downloader.py CHANGED Viewed

@@ -2,12 +2,14 @@
 parallel_yf_downloader.py
 Parallel downloading of ticker historical prices using multiprocessing,
 with retry and rate-limit handling and batching.
 """
 import time
 import random
 from itertools import islice
-from typing import Any
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import yfinance as yf
@@ -22,13 +24,75 @@ MAX_RETRIES = 3                 # Retry count on failure
 SLEEP_BETWEEN_RETRIES = 1.0     # Seconds between retries
 BATCH_SIZE = 50                 # Number of tickers per batch
 MIN_DATA_POINTS = 50            # Minimum number of price points required
-def fetch_prices(ticker: str, max_retries: int = MAX_RETRIES) -> dict[str, Any]:
     """
     Download all-time closing prices for a single ticker safely.
-    Returns dict {'ticker': ticker, 'prices': ndarray, 'dates': DatetimeIndex} or None if failed.
     """
     for attempt in range(max_retries):
         try:
             df = yf.download(ticker, period="max", progress=False, auto_adjust=True)
@@ -59,11 +123,18 @@ def fetch_prices(ticker: str, max_retries: int = MAX_RETRIES) -> dict[str, Any]:
             if prices.ndim > 1:
                 prices = prices.flatten()
-            return {
                 "ticker": ticker,
                 "prices": prices,
                 "dates": dates
             }
         except yf.shared.YFRateLimitError:
             wait = SLEEP_BETWEEN_RETRIES + random.random()
             logger.warning(f"Rate limited for {ticker}. Waiting {wait:.1f}s and retrying...")
@@ -84,25 +155,55 @@ def batch(iterable: list[str], n: int = BATCH_SIZE):
             break
         yield chunk
-def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS) -> list[dict[str, Any]]:
     """
     Download a large list of tickers in parallel batches.
-    Returns a list of {'ticker': ..., 'prices': ..., 'dates': ...} dicts.
     """
-    all_results = []
     all_failed = []
-    for batch_num, ticker_batch in enumerate(batch(tickers, BATCH_SIZE), start=1):
-        logger.info(f"Processing batch {batch_num}: {len(ticker_batch)} tickers")
-        results, failed = process_batch(ticker_batch, max_workers)
-        all_results.extend(results)
-        all_failed.extend(failed)
-        # small sleep between batches to reduce rate-limit chance
-        time.sleep(1 + random.random())
-    logger.info(f"Total downloaded: {len(all_results)}")
     if all_failed:
         logger.warning(f"Total failed: {len(all_failed)} - {all_failed[:10]}")  # Show first 10
     return all_results
 def process_batch(ticker_batch: list[str], max_workers: int) -> tuple[list[dict[str, Any]], list[Any]]:
@@ -127,22 +228,29 @@ def process_batch(ticker_batch: list[str], max_workers: int) -> tuple[list[dict[
     return results, failed
 def run_parallel_data_downloader(exchange: StockExchange = StockExchange.NASDAQ,
-                                 limit: int = 200) -> list[dict[str, Any]]:
     """
-    Main function to download ticker data in parallel.
     Args:
         exchange: Stock exchange to download from
         limit: Maximum number of tickers to download
     Returns:
         List of dicts with ticker, prices, and dates
     """
     all_tickers = TickersProvider().get_tickers(exchange)
     tickers = all_tickers[:limit]
-    logger.info(f"Starting parallel download for {len(tickers)} tickers from {exchange.value}...")
-    data = download_tickers_parallel(tickers)
-    logger.info(f"Downloaded {len(data)} tickers successfully")
     return data

 parallel_data_downloader.py
 Parallel downloading of ticker historical prices using multiprocessing,
 with retry and rate-limit handling and batching.
+Includes in-memory caching with 2-hour expiry.
 """
 import time
 import random
 from itertools import islice
+from typing import Any, Optional
+from datetime import datetime, timedelta
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import yfinance as yf
 SLEEP_BETWEEN_RETRIES = 1.0     # Seconds between retries
 BATCH_SIZE = 50                 # Number of tickers per batch
 MIN_DATA_POINTS = 50            # Minimum number of price points required
+CACHE_EXPIRY_HOURS = 2          # Hours a cached entry stays valid before it is treated as expired
+# In-memory cache: ticker symbol -> cached data dict. Module-level state, so each process holds its own copy.
+_ticker_cache: dict[str, dict[str, Any]] = {}
+_cache_timestamps: dict[str, datetime] = {}  # ticker symbol -> datetime.now() recorded when the entry was stored
def _is_cache_valid(ticker: str) -> bool:
    """Report whether ``ticker`` has a cache timestamp inside the expiry window.

    Returns False when no timestamp has been recorded for ``ticker``;
    otherwise True only if the entry's age is strictly less than
    CACHE_EXPIRY_HOURS hours.
    """
    try:
        stored_at = _cache_timestamps[ticker]
    except KeyError:
        # Never cached (or cache was cleared): nothing to validate.
        return False
    max_age = timedelta(hours=CACHE_EXPIRY_HOURS)
    return datetime.now() - stored_at < max_age
def _get_cached_data(ticker: str) -> Optional[dict[str, Any]]:
    """Look up ``ticker`` in the in-memory cache.

    Returns the stored entry while its timestamp is still within the expiry
    window, and None when the entry is missing or has expired. The stored
    object itself is returned (no copy is made).
    """
    if not _is_cache_valid(ticker):
        return None
    logger.debug(f"Using cached data for {ticker}")
    return _ticker_cache.get(ticker)
def _cache_data(ticker: str, data: dict[str, Any]) -> None:
    """Store ``data`` under ``ticker`` and stamp the entry with the current time.

    Any existing entry for ``ticker`` is overwritten, which also restarts its
    expiry window. ``data`` is stored by reference, not copied.
    """
    stored_at = datetime.now()
    _ticker_cache[ticker] = data
    _cache_timestamps[ticker] = stored_at
    logger.debug(f"Cached data for {ticker}")
def clear_cache() -> None:
    """Empty the in-memory ticker cache (useful for testing or a manual refresh).

    Removes every cached data entry and every timestamp, then logs that the
    cache was cleared. Only the module state of the calling process is
    affected.
    """
    # No ``global`` declaration is needed: both dicts are mutated in place
    # and the module-level names are never rebound.
    _ticker_cache.clear()
    _cache_timestamps.clear()
    logger.info("Cache cleared")
def get_cache_stats() -> dict[str, Any]:
    """Summarize the current state of the in-memory cache.

    Returns:
        dict with three integer counts:
        'total_cached'   - number of entries currently stored,
        'valid_cached'   - entries whose timestamp is within the expiry window,
        'expired_cached' - stored entries that are no longer valid.
    """
    total = len(_ticker_cache)
    # Each bool counts as 0 or 1, so the sum is the number of valid entries.
    valid = sum(_is_cache_valid(symbol) for symbol in _ticker_cache)
    return {
        'total_cached': total,
        'valid_cached': valid,
        'expired_cached': total - valid,
    }
+def fetch_prices(ticker: str, max_retries: int = MAX_RETRIES, use_cache: bool = True) -> dict[str, Any]:
     """
     Download all-time closing prices for a single ticker safely.
+    Uses in-memory cache if available and not expired.
+    Args:
+        ticker: Stock ticker symbol
+        max_retries: Maximum number of retry attempts
+        use_cache: Whether to use cached data if available
+    Returns:
+        dict {'ticker': ticker, 'prices': ndarray, 'dates': DatetimeIndex} or None if failed
     """
+    # Check cache first
+    if use_cache:
+        cached_data = _get_cached_data(ticker)
+        if cached_data is not None:
+            return cached_data
+    # Download fresh data
     for attempt in range(max_retries):
         try:
             df = yf.download(ticker, period="max", progress=False, auto_adjust=True)
             if prices.ndim > 1:
                 prices = prices.flatten()
+            result = {
                 "ticker": ticker,
                 "prices": prices,
                 "dates": dates
             }
+            # Cache the result
+            if use_cache:
+                _cache_data(ticker, result)
+            return result
         except yf.shared.YFRateLimitError:
             wait = SLEEP_BETWEEN_RETRIES + random.random()
             logger.warning(f"Rate limited for {ticker}. Waiting {wait:.1f}s and retrying...")
             break
         yield chunk
def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS,
                            use_cache: bool = True) -> list[dict[str, Any]]:
    """
    Download a large list of tickers in parallel batches.
    Uses in-memory cache to avoid re-downloading recently fetched data.

    Tickers with a valid cache entry are served from the cache; the rest are
    downloaded in batches of BATCH_SIZE via process_batch, with a short random
    sleep after each batch to reduce the chance of rate limiting. Freshly
    downloaded results are written into this process's cache so that later
    calls can actually hit it.

    Args:
        tickers: List of ticker symbols to download
        max_workers: Number of parallel workers
        use_cache: Whether to read from and write to the in-memory cache
    Returns:
        List of {'ticker': ..., 'prices': ..., 'dates': ...} dicts, cached
        entries first, followed by downloaded ones. Failed tickers are
        omitted (they are only logged).
    """
    # Separate cached and non-cached tickers
    cached_results: list[dict[str, Any]] = []
    if use_cache:
        tickers_to_download = []
        for ticker in tickers:
            cached_data = _get_cached_data(ticker)
            # Explicit None check, consistent with fetch_prices.
            if cached_data is not None:
                cached_results.append(cached_data)
            else:
                tickers_to_download.append(ticker)
        if cached_results:
            logger.info(f"Using cached data for {len(cached_results)} tickers")
    else:
        tickers_to_download = list(tickers)

    # Download remaining tickers
    all_results = list(cached_results)
    all_failed = []
    if tickers_to_download:
        logger.info(f"Downloading {len(tickers_to_download)} tickers...")
        for batch_num, ticker_batch in enumerate(batch(tickers_to_download, BATCH_SIZE), start=1):
            logger.info(f"Processing batch {batch_num}: {len(ticker_batch)} tickers")
            results, failed = process_batch(ticker_batch, max_workers)
            if use_cache:
                # NOTE(review): process_batch presumably runs fetch_prices in
                # ProcessPoolExecutor workers - confirm. Module-level cache
                # writes made in a worker process are not visible here, so the
                # results are stored in this process's cache explicitly.
                for item in results:
                    _cache_data(item["ticker"], item)
            all_results.extend(results)
            all_failed.extend(failed)
            # small sleep between batches to reduce rate-limit chance
            time.sleep(1 + random.random())

    logger.info(f"Total available: {len(all_results)} (cached: {len(cached_results)}, downloaded: {len(all_results) - len(cached_results)})")
    if all_failed:
        logger.warning(f"Total failed: {len(all_failed)} - {all_failed[:10]}")  # Show first 10
    return all_results
 def process_batch(ticker_batch: list[str], max_workers: int) -> tuple[list[dict[str, Any]], list[Any]]:
     return results, failed
 def run_parallel_data_downloader(exchange: StockExchange = StockExchange.NASDAQ,
+                                 limit: int = 200,
+                                 use_cache: bool = True) -> list[dict[str, Any]]:
     """
+    Main function to download ticker data in parallel with caching.
     Args:
         exchange: Stock exchange to download from
         limit: Maximum number of tickers to download
+        use_cache: Whether to use cached data (expires after 2 hours)
     Returns:
         List of dicts with ticker, prices, and dates
     """
     all_tickers = TickersProvider().get_tickers(exchange)
     tickers = all_tickers[:limit]
+    # Log cache stats
+    cache_stats = get_cache_stats()
+    logger.info(f"Cache stats: {cache_stats['valid_cached']} valid, {cache_stats['expired_cached']} expired")
+    logger.info(f"Starting download for {len(tickers)} tickers from {exchange.value}...")
+    data = download_tickers_parallel(tickers, use_cache=use_cache)
+    logger.info(f"Retrieved {len(data)} tickers successfully")
     return data