Spaces:

VIDraft
/

RAGOndevice

Running

App Files Community

cutechicken committed on Dec 17, 2024

Commit

fcd720a

verified ·

1 Parent(s): d6a3ccb

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -119

app.py CHANGED Viewed

@@ -128,6 +128,9 @@ def find_relevant_context(query, top_k=3):
     return relevant_contexts
 def analyze_file_content(content, file_type):
     """Analyze file content and return structural summary"""
     if file_type in ['parquet', 'csv']:
@@ -136,9 +139,9 @@ def analyze_file_content(content, file_type):
             header = lines[0]
             columns = header.count('|') - 1
             rows = len(lines) - 3
-            return f"📊 데이터셋 구조: {columns}개 컬럼, {rows}개 데이터"
         except:
-            return "❌ 데이터셋 구조 분석 실패"
     lines = content.split('\n')
     total_lines = len(lines)
@@ -148,51 +151,11 @@ def analyze_file_content(content, file_type):
         functions = len([line for line in lines if 'def ' in line])
         classes = len([line for line in lines if 'class ' in line])
         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
-        return f"💻 코드 구조: {total_lines}줄 (함수: {functions}, 클래스: {classes}, 임포트: {imports})"
     paragraphs = content.count('\n\n') + 1
     words = len(content.split())
-    return f"📝 문서 구조: {total_lines}줄, {paragraphs}단락, 약 {words}단어"
-def extract_pdf_text_with_ocr(file_path):
-    try:
-        # Poppler 경로 설정
-        if platform.system() == 'Windows':
-            poppler_path = r"C:\Program Files\poppler-0.68.0\bin"
-        else:
-            poppler_path = None  # Linux의 경우 기본 경로 사용
-        # PDF를 이미지로 변환
-        images = convert_from_path(
-            file_path,
-            poppler_path=poppler_path,
-            fmt='jpeg',
-            grayscale=False,
-            size=(1700, None)  # 해상도 향상
-        )
-        # 전체 텍스트 저장
-        text = ""
-        # 각 페이지에 대해 OCR 수행
-        for i, image in enumerate(images):
-            try:
-                # OCR 설정
-                custom_config = r'--oem 3 --psm 6 -l kor+eng'
-                # OCR 수행
-                page_text = pytesseract.image_to_string(
-                    image,
-                    config=custom_config
-                )
-                text += f"\n--- 페이지 {i+1} ---\n{page_text}\n"
-            except Exception as e:
-                print(f"페이지 {i+1} OCR 오류: {str(e)}")
-                continue
-        return text
-    except Exception as e:
-        return f"PDF 텍스트 추출 오류: {str(e)}"
 def read_uploaded_file(file):
     if file is None:
@@ -200,62 +163,56 @@ def read_uploaded_file(file):
     try:
         file_ext = os.path.splitext(file.name)[1].lower()
-        # Parquet 파일 처리
         if file_ext == '.parquet':
             try:
                 table = pq.read_table(file.name)
                 df = table.to_pandas()
-                content = f"📊 Parquet 파일 분석:\n\n"
-                content += f"1. 기본 정보:\n"
-                content += f"- 전체 행 수: {len(df):,}개\n"
-                content += f"- 전체 열 수: {len(df.columns)}개\n"
-                content += f"- 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
-                content += f"2. 컬럼 정보:\n"
                 for col in df.columns:
                     content += f"- {col} ({df[col].dtype})\n"
-                content += f"\n3. 데이터 미리보기:\n"
-                # tabulate 사용하여 테이블 형식으로 출력
                 content += tabulate(df.head(5), headers='keys', tablefmt='pipe', showindex=False)
-                content += f"\n\n4. 결측치 정보:\n"
                 null_counts = df.isnull().sum()
                 for col, count in null_counts[null_counts > 0].items():
-                    content += f"- {col}: {count:,}개 ({count/len(df)*100:.1f}%)\n"
-                # 수치형 컬럼에 대한 기본 통계
                 numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
                 if len(numeric_cols) > 0:
-                    content += f"\n5. 수치형 컬럼 통계:\n"
                     stats_df = df[numeric_cols].describe()
                     content += tabulate(stats_df, headers='keys', tablefmt='pipe')
                 return content, "parquet"
             except Exception as e:
-                return f"Parquet 파일 읽기 오류: {str(e)}", "error"
-        # PDF 파일 처리
         if file_ext == '.pdf':
             try:
                 pdf_reader = pypdf.PdfReader(file.name)
                 total_pages = len(pdf_reader.pages)
-                content = f"📑 PDF 문서 분석:\n\n"
-                content += f"1. 기본 정보:\n"
-                content += f"- 총 페이지 수: {total_pages}페이지\n"
-                # 메타데이터 추출
                 if pdf_reader.metadata:
-                    content += "\n2. 메타데이터:\n"
                     for key, value in pdf_reader.metadata.items():
                         if value and str(key).startswith('/'):
                             content += f"- {key[1:]}: {value}\n"
-                # 먼저 pdfminer로 텍스트 추출 시도
                 try:
                     text = extract_text(
                         file.name,
@@ -269,117 +226,101 @@ def read_uploaded_file(file):
                 except:
                     text = ""
-                # pdfminer로 추출 실패시 OCR 시도
                 if not text.strip():
                     text = extract_pdf_text_with_ocr(file.name)
-                # 텍스트 분석
                 if text:
                     words = text.split()
                     lines = text.split('\n')
-                    content += f"\n3. 텍스트 분석:\n"
-                    content += f"- 총 단어 수: {len(words):,}개\n"
-                    content += f"- 고유 단어 수: {len(set(words)):,}개\n"
-                    content += f"- 총 라인 수: {len(lines):,}개\n"
-                    # 본문 내용
-                    content += f"\n4. 본문 내용:\n"
-                    preview_length = min(2000, len(text))  # 미리보기 길이 증가
-                    content += f"--- 처음 {preview_length}자 ---\n"
                     content += text[:preview_length]
                     if len(text) > preview_length:
-                        content += f"\n... (총 {len(text):,}자 중 일부 표시)\n"
                 else:
-                    content += "\n⚠️ 텍스트 추출 실패"
                 return content, "pdf"
             except Exception as e:
-                return f"PDF 파일 읽기 오류: {str(e)}", "error"
-        # CSV 파일 처리
         elif file_ext == '.csv':
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     df = pd.read_csv(file.name, encoding=encoding)
-                    content = f"📊 CSV 파일 분석:\n\n"
-                    content += f"1. 기본 정보:\n"
-                    content += f"- 전체 행 수: {len(df):,}개\n"
-                    content += f"- 전체 열 수: {len(df.columns)}개\n"
-                    content += f"- 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
-                    content += f"2. 컬럼 정보:\n"
                     for col in df.columns:
                         content += f"- {col} ({df[col].dtype})\n"
-                    content += f"\n3. 데이터 미리보기:\n"
                     content += df.head(5).to_markdown(index=False)
-                    content += f"\n\n4. 결측치 정보:\n"
                     null_counts = df.isnull().sum()
                     for col, count in null_counts[null_counts > 0].items():
-                        content += f"- {col}: {count:,}개 ({count/len(df)*100:.1f}%)\n"
                     return content, "csv"
                 except UnicodeDecodeError:
                     continue
-            raise UnicodeDecodeError(f"지원되는 인코딩으로 파일을 읽을 수 없습니다 ({', '.join(encodings)})")
-        # 텍스트 파일 처리
         else:
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     with open(file.name, 'r', encoding=encoding) as f:
                         content = f.read()
-                    # 파일 내용 분석
                     lines = content.split('\n')
                     total_lines = len(lines)
                     non_empty_lines = len([line for line in lines if line.strip()])
-                    # 코드 파일 여부 확인
                     is_code = any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function'])
-                    analysis = f"\n📝 파일 분석:\n"
                     if is_code:
-                        # 코드 파일 분석
                         functions = len([line for line in lines if 'def ' in line])
                         classes = len([line for line in lines if 'class ' in line])
                         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
-                        analysis += f"- 파일 유형: 코드\n"
-                        analysis += f"- 전체 라인 수: {total_lines:,}줄\n"
-                        analysis += f"- 함수 수: {functions}개\n"
-                        analysis += f"- 클래스 수: {classes}개\n"
-                        analysis += f"- import 문 수: {imports}개\n"
                     else:
-                        # 일반 텍스트 파일 분석
                         words = len(content.split())
                         chars = len(content)
-                        analysis += f"- 파일 유형: 텍스트\n"
-                        analysis += f"- 전체 라인 수: {total_lines:,}줄\n"
-                        analysis += f"- 실제 내용이 있는 라인 수: {non_empty_lines:,}줄\n"
-                        analysis += f"- 단어 수: {words:,}개\n"
-                        analysis += f"- 문자 수: {chars:,}개\n"
                     return content + analysis, "text"
                 except UnicodeDecodeError:
                     continue
-            raise UnicodeDecodeError(f"지원되는 인코딩으로 파일을 읽을 수 없습니다 ({', '.join(encodings)})")
     except Exception as e:
-        return f"파일 읽기 오류: {str(e)}", "error"
-# 파일 업로드 이벤트 핸들링 수정
-def init_msg():
-    return "파일을 분석하고 있습니다..."
 CSS = """

     return relevant_contexts
+def init_msg():
+    return "Analyzing file..."
 def analyze_file_content(content, file_type):
     """Analyze file content and return structural summary"""
     if file_type in ['parquet', 'csv']:
             header = lines[0]
             columns = header.count('|') - 1
             rows = len(lines) - 3
+            return f"📊 Dataset Structure: {columns} columns, {rows} rows"
         except:
+            return "❌ Failed to analyze dataset structure"
     lines = content.split('\n')
     total_lines = len(lines)
         functions = len([line for line in lines if 'def ' in line])
         classes = len([line for line in lines if 'class ' in line])
         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
+        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"
     paragraphs = content.count('\n\n') + 1
     words = len(content.split())
+    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words"
 def read_uploaded_file(file):
     if file is None:
     try:
         file_ext = os.path.splitext(file.name)[1].lower()
+        # Parquet file processing
         if file_ext == '.parquet':
             try:
                 table = pq.read_table(file.name)
                 df = table.to_pandas()
+                content = f"📊 Parquet File Analysis:\n\n"
+                content += f"1. Basic Information:\n"
+                content += f"- Total Rows: {len(df):,}\n"
+                content += f"- Total Columns: {len(df.columns)}\n"
+                content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
+                content += f"2. Column Information:\n"
                 for col in df.columns:
                     content += f"- {col} ({df[col].dtype})\n"
+                content += f"\n3. Data Preview:\n"
                 content += tabulate(df.head(5), headers='keys', tablefmt='pipe', showindex=False)
+                content += f"\n\n4. Missing Values:\n"
                 null_counts = df.isnull().sum()
                 for col, count in null_counts[null_counts > 0].items():
+                    content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
                 numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
                 if len(numeric_cols) > 0:
+                    content += f"\n5. Numeric Column Statistics:\n"
                     stats_df = df[numeric_cols].describe()
                     content += tabulate(stats_df, headers='keys', tablefmt='pipe')
                 return content, "parquet"
             except Exception as e:
+                return f"Error reading Parquet file: {str(e)}", "error"
+        # PDF file processing
         if file_ext == '.pdf':
             try:
                 pdf_reader = pypdf.PdfReader(file.name)
                 total_pages = len(pdf_reader.pages)
+                content = f"📑 PDF Document Analysis:\n\n"
+                content += f"1. Basic Information:\n"
+                content += f"- Total Pages: {total_pages}\n"
                 if pdf_reader.metadata:
+                    content += "\n2. Metadata:\n"
                     for key, value in pdf_reader.metadata.items():
                         if value and str(key).startswith('/'):
                             content += f"- {key[1:]}: {value}\n"
                 try:
                     text = extract_text(
                         file.name,
                 except:
                     text = ""
                 if not text.strip():
                     text = extract_pdf_text_with_ocr(file.name)
                 if text:
                     words = text.split()
                     lines = text.split('\n')
+                    content += f"\n3. Text Analysis:\n"
+                    content += f"- Total Words: {len(words):,}\n"
+                    content += f"- Unique Words: {len(set(words)):,}\n"
+                    content += f"- Total Lines: {len(lines):,}\n"
+                    content += f"\n4. Content Preview:\n"
+                    preview_length = min(2000, len(text))
+                    content += f"--- First {preview_length} characters ---\n"
                     content += text[:preview_length]
                     if len(text) > preview_length:
+                        content += f"\n... (Showing partial content of {len(text):,} characters)\n"
                 else:
+                    content += "\n⚠️ Text extraction failed"
                 return content, "pdf"
             except Exception as e:
+                return f"Error reading PDF file: {str(e)}", "error"
+        # CSV file processing
         elif file_ext == '.csv':
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     df = pd.read_csv(file.name, encoding=encoding)
+                    content = f"📊 CSV File Analysis:\n\n"
+                    content += f"1. Basic Information:\n"
+                    content += f"- Total Rows: {len(df):,}\n"
+                    content += f"- Total Columns: {len(df.columns)}\n"
+                    content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
+                    content += f"2. Column Information:\n"
                     for col in df.columns:
                         content += f"- {col} ({df[col].dtype})\n"
+                    content += f"\n3. Data Preview:\n"
                     content += df.head(5).to_markdown(index=False)
+                    content += f"\n\n4. Missing Values:\n"
                     null_counts = df.isnull().sum()
                     for col, count in null_counts[null_counts > 0].items():
+                        content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
                     return content, "csv"
                 except UnicodeDecodeError:
                     continue
+            raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})")
+        # Text file processing
         else:
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     with open(file.name, 'r', encoding=encoding) as f:
                         content = f.read()
                     lines = content.split('\n')
                     total_lines = len(lines)
                     non_empty_lines = len([line for line in lines if line.strip()])
                     is_code = any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function'])
+                    analysis = f"\n📝 File Analysis:\n"
                     if is_code:
                         functions = len([line for line in lines if 'def ' in line])
                         classes = len([line for line in lines if 'class ' in line])
                         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
+                        analysis += f"- File Type: Code\n"
+                        analysis += f"- Total Lines: {total_lines:,}\n"
+                        analysis += f"- Functions: {functions}\n"
+                        analysis += f"- Classes: {classes}\n"
+                        analysis += f"- Import Statements: {imports}\n"
                     else:
                         words = len(content.split())
                         chars = len(content)
+                        analysis += f"- File Type: Text\n"
+                        analysis += f"- Total Lines: {total_lines:,}\n"
+                        analysis += f"- Non-empty Lines: {non_empty_lines:,}\n"
+                        analysis += f"- Word Count: {words:,}\n"
+                        analysis += f"- Character Count: {chars:,}\n"
                     return content + analysis, "text"
                 except UnicodeDecodeError:
                     continue
+            raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})")
     except Exception as e:
+        return f"Error reading file: {str(e)}", "error"
 CSS = """