Starburst15 commited on
Commit
90e0941
·
verified ·
1 Parent(s): 793855f

Create utils.py

Browse files
Files changed (1) hide show
  1. src/utils.py +183 -0
src/utils.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import pandas as pd
4
+ import numpy as np
5
+ from io import StringIO
6
+ from huggingface_hub import InferenceClient
7
+ import google.generativeai as genai
8
+
9
+ # ======================================================
10
+ # 🔧 HELPER FUNCTIONS
11
+ # ======================================================
12
+
13
def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2):
    """Call Hugging Face text generation with retries and graceful fallbacks.

    Args:
        client: InferenceClient-like object exposing ``text_generation`` and
            ``chat_completion``.
        prompt: Text prompt to send to the model.
        temperature: Sampling temperature.
        max_tokens: Maximum number of new tokens to generate.
        retries: Extra attempts allowed after the first failure.

    Returns:
        The generated text, stripped of surrounding whitespace, or a
        user-facing warning string if the service stays unavailable.

    Raises:
        Exception: Re-raises any error that is neither a 503 outage nor a
            "conversational task only" mismatch.
    """
    for attempt in range(retries + 1):
        try:
            resp = client.text_generation(
                prompt,
                temperature=temperature,
                max_new_tokens=max_tokens,
                return_full_text=False,
            )
            return resp.strip()
        except Exception as e:
            err = str(e)
            if "503" in err or "Service Temporarily Unavailable" in err:
                if attempt < retries:
                    # Transient outage: back off briefly, then retry.
                    # (No sleep on the final attempt — we are about to give up.)
                    time.sleep(2)
                    continue
                return "⚠️ The Hugging Face model is temporarily unavailable. Please try again later."
            if "Supported task: conversational" in err:
                # Model only serves the chat API; retry via chat_completion.
                chat_resp = client.chat_completion(
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature,
                )
                return chat_resp["choices"][0]["message"]["content"].strip()
            # Unknown failure: preserve the original traceback.
            raise
    return "⚠️ Failed after multiple retries."
42
+
43
+ # ======================================================
44
+ # 🧼 DATA CLEANING
45
+ # ======================================================
46
+
47
def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Perform a basic, model-free cleaning pass.

    Used when AI-based cleaning fails. Drops all-empty columns, normalizes
    column names to snake_case, fills missing values (mode or "Unknown" for
    object columns, median for numeric ones), and removes duplicate rows.

    Args:
        df: Raw dataset; the caller's frame is not modified.

    Returns:
        A cleaned copy of the dataset.
    """
    df = df.copy()
    df.dropna(axis=1, how="all", inplace=True)
    df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]

    for col in df.columns:
        if df[col].dtype == "O":
            mode = df[col].mode()
            # An empty mode means there are no non-null values to learn from.
            fill_value = mode[0] if not mode.empty else "Unknown"
            # Assign back instead of fillna(inplace=True) on a column
            # selection: chained assignment is deprecated and a silent
            # no-op under pandas copy-on-write.
            df[col] = df[col].fillna(fill_value)
        else:
            df[col] = df[col].fillna(df[col].median())

    df.drop_duplicates(inplace=True)
    return df
64
+
65
+
66
def ai_clean_dataset(df: pd.DataFrame, cleaner_client: InferenceClient) -> tuple[pd.DataFrame, str]:
    """Clean a small dataset using the chosen Hugging Face model.

    Args:
        df: Dataset to clean.
        cleaner_client: Hugging Face InferenceClient used for generation.

    Returns:
        A ``(cleaned_df, status_message)`` tuple. Datasets over 50 rows are
        returned unchanged; on any model/parse failure the dataset is
        cleaned with ``fallback_clean`` instead.
    """
    # Guard against prompt-size blowups: only small datasets go to the model.
    if len(df) > 50:
        return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."

    csv_text = df.to_csv(index=False)
    prompt = f"""
You are a professional data cleaning assistant.
Clean and standardize the dataset below dynamically:
1. Handle missing values
2. Fix column name inconsistencies
3. Convert data types (dates, numbers, categories)
4. Remove irrelevant or duplicate rows
Return ONLY a valid CSV text (no markdown, no explanations).

Dataset:
{csv_text}
"""
    try:
        cleaned_str = safe_hf_generate(cleaner_client, prompt, temperature=0.1, max_tokens=4096)
        # Models often wrap output in markdown fences; strip before parsing.
        cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").strip()
        cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
        # Normalize column names the same way fallback_clean does.
        cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
        return cleaned_df, "✅ AI cleaning completed successfully."
    except Exception as e:
        return fallback_clean(df), f"⚠️ AI cleaning failed, used fallback cleaning instead: {str(e)}"
92
+
93
+ # ======================================================
94
+ # 📊 DATA SUMMARIZATION
95
+ # ======================================================
96
+
97
def summarize_for_analysis(df: pd.DataFrame, sample_rows: int = 10) -> str:
    """Build a compact plain-text profile of *df* for use in an LLM prompt.

    Emits the row/column counts, one line per column (mean/median for
    numeric columns, top-3 value counts otherwise, plus non-null counts),
    followed by a CSV sample of the first ``sample_rows`` rows.
    """
    lines = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]

    for name in df.columns:
        series = df[name]
        filled = int(series.notnull().sum())
        if pd.api.types.is_numeric_dtype(series):
            stats = series.describe().to_dict()
            mean = stats.get("mean", np.nan)
            lines.append(
                f"- {name}: mean={mean:.2f}, median={series.median():.2f}, non_null={filled}"
            )
        else:
            frequent = series.value_counts().head(3).to_dict()
            lines.append(f"- {name}: top_values={frequent}, non_null={filled}")

    lines.append("--- Sample Data ---")
    lines.append(df.head(sample_rows).to_csv(index=False))
    return "\n".join(lines)
116
+
117
+ # ======================================================
118
+ # 🧠 ANALYSIS LOGIC
119
+ # ======================================================
120
+
121
def _gemini_generate(prompt, api_key, generation_config=None):
    """Configure Gemini and run one generate_content call; returns the raw response.

    Centralizes the configure/model/generate boilerplate that was previously
    duplicated at three call sites.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-2.5-flash")
    if generation_config is not None:
        return model.generate_content(prompt, generation_config=generation_config)
    return model.generate_content(prompt)


def query_analysis_model(
    df: pd.DataFrame,
    user_query: str,
    dataset_name: str,
    analyst_model: str,
    hf_client: InferenceClient = None,
    temperature: float = 0.3,
    max_tokens: int = 1024,
    gemini_api_key: str = None
) -> str:
    """Query the selected AI model (Hugging Face or Gemini) to analyze the dataset.

    Args:
        df: Dataset to analyze (summarized, not sent in full).
        user_query: The user's question about the data.
        dataset_name: Display name of the dataset, embedded in the prompt.
        analyst_model: Model selector; "Gemini 2.5 Flash (Google)" routes to
            Gemini, anything else routes to the Hugging Face client.
        hf_client: InferenceClient for the Hugging Face path.
        temperature: Sampling temperature.
        max_tokens: Maximum output tokens.
        gemini_api_key: Gemini API key; also enables Gemini fallback on
            Hugging Face outages.

    Returns:
        The model's analysis text, or a user-facing warning string on failure.
    """
    prompt_summary = summarize_for_analysis(df)
    prompt = f"""
You are a professional data analyst.
Analyze the dataset '{dataset_name}' and answer the user's question.

--- DATA SUMMARY ---
{prompt_summary}

--- USER QUESTION ---
{user_query}

Respond with:
1. Key insights and patterns
2. Quantitative findings
3. Notable relationships or anomalies
4. Data-driven recommendations
"""
    try:
        if analyst_model == "Gemini 2.5 Flash (Google)":
            if not gemini_api_key:
                return "⚠️ Gemini API key missing. Cannot use Gemini."
            response = _gemini_generate(
                prompt,
                gemini_api_key,
                generation_config={
                    "temperature": temperature,
                    "max_output_tokens": max_tokens,
                },
            )
            return response.text if hasattr(response, "text") else "No valid text response."

        # Otherwise, use the Hugging Face model.
        result = safe_hf_generate(hf_client, prompt, temperature=temperature, max_tokens=max_tokens)

        # safe_hf_generate signals an outage via its warning string; fall
        # back to Gemini when a key is available.
        if "temporarily unavailable" in result.lower() and gemini_api_key:
            alt = _gemini_generate(prompt, gemini_api_key)
            return f"🔄 Fallback to Gemini:\n\n{alt.text}"
        return result

    except Exception as e:
        # A raised 503 can still be salvaged via Gemini.
        if "503" in str(e) and gemini_api_key:
            response = _gemini_generate(prompt, gemini_api_key)
            return f"🔄 Fallback to Gemini due to 503 error:\n\n{response.text}"
        return f"⚠️ Analysis failed: {str(e)}"
180
+
181
+ # ======================================================
182
+ # 🔍 END OF MODULE
183
+ # ======================================================