# Hugging Face Space: vydrking / ITMO-QA (status when captured: Sleeping)
# File size: 8,891 Bytes
# c204272 (presumably the revision this file was captured at)

import os
import re
import zlib
from typing import Dict, List, Optional

import pdfplumber
import requests
from tqdm import tqdm

class PDFParser:
    """Download curriculum PDFs and extract course records from them.

    Each course is returned as a plain dict with the keys ``id``,
    ``program_id``, ``semester``, ``name``, ``credits``, ``hours``, ``type``,
    ``source_pdf`` and ``source_page``.
    """

    def __init__(self):
        # One shared session so that the User-Agent header is sent with
        # every download request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        """Download ``url`` into ``data/raw/<filename>`` unless it already exists.

        Args:
            url: Address of the PDF to fetch.
            filename: File name to store the PDF under inside ``data/raw``.

        Returns:
            The local path of the PDF, or ``None`` if the download failed.
        """
        local_path = os.path.join('data/raw', filename)

        if os.path.exists(local_path):
            print(f'PDF уже загружен: {filename}')
            return local_path

        # Stream into a temporary sibling file first: an interrupted download
        # must not leave a truncated file at ``local_path``, because the
        # existence check above would treat it as a finished download.
        partial_path = local_path + '.part'

        try:
            print(f'Загрузка PDF: {url}')
            # The context manager releases the streamed connection even when
            # writing fails half-way.
            with self.session.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()

                os.makedirs('data/raw', exist_ok=True)

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(partial_path, local_path)
            print(f'PDF сохранен: {local_path}')
            return local_path

        except Exception as e:
            print(f'Ошибка загрузки PDF {url}: {e}')
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written yet, or the file is already gone.
                pass
            return None

    def parse_pdf(self, pdf_path: str, program_id: str) -> List[Dict]:
        """Parse every page of ``pdf_path`` and return the courses found.

        Errors are reported with ``print`` and never raised; the courses
        collected before the error are still returned.
        """
        courses = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                print(f'Парсинг PDF: {pdf_path}')

                pages = tqdm(pdf.pages, desc='Страницы')
                for page_num, page in enumerate(pages, start=1):
                    courses.extend(
                        self._parse_page(page, page_num, program_id, pdf_path)
                    )

                print(f'Найдено курсов: {len(courses)}')

        except Exception as e:
            print(f'Ошибка парсинга PDF {pdf_path}: {e}')

        return courses

    def _parse_page(self, page, page_num: int, program_id: str,
                    pdf_path: Optional[str] = None) -> List[Dict]:
        """Extract courses from one page: tables first, plain text as fallback.

        ``pdf_path`` is optional and only used to fill ``source_pdf``.
        """
        courses = []

        try:
            for table in page.extract_tables():
                courses.extend(
                    self._parse_table(table, page_num, program_id, pdf_path)
                )

            if not courses:
                courses = self._parse_text_fallback(
                    page, page_num, program_id, pdf_path
                )

        except Exception as e:
            print(f'Ошибка парсинга страницы {page_num}: {e}')

        return courses

    def _parse_table(self, table: list, page_num: int, program_id: str,
                     pdf_path: Optional[str] = None) -> List[Dict]:
        """Turn one extracted table into course dicts.

        The first row is treated as the header; rows with fewer than three
        cells are skipped.
        """
        courses = []

        if not table or len(table) < 2:
            return courses

        headers = [str(cell).lower().strip() if cell else '' for cell in table[0]]

        for row in table[1:]:
            if not row or len(row) < 3:
                continue

            course = self._extract_course_from_row(
                row, headers, page_num, program_id, pdf_path
            )
            if course:
                courses.append(course)

        return courses

    def _extract_course_from_row(self, row: list, headers: list, page_num: int,
                                 program_id: str,
                                 pdf_path: Optional[str] = None) -> Optional[Dict]:
        """Build a course dict from a table row.

        Returns ``None`` when the row has no usable name (shorter than three
        characters) or when extraction fails.
        """
        try:
            row = [str(cell).strip() if cell else '' for cell in row]

            name = self._extract_name(row, headers)
            if not name or len(name) < 3:
                return None

            return {
                'id': self._make_course_id(program_id, page_num, name),
                'program_id': program_id,
                'semester': self._extract_semester(row, headers),
                'name': name,
                'credits': self._extract_credits(row, headers),
                'hours': self._extract_hours(row, headers),
                'type': self._extract_type(row, headers),
                'source_pdf': self._source_name(pdf_path),
                'source_page': page_num
            }

        except Exception as e:
            # Best effort per row: one malformed row must not abort the table.
            print(f'Ошибка извлечения курса из строки: {e}')
            return None

    @staticmethod
    def _make_course_id(program_id: str, page_num: int, name: str) -> str:
        """Return ``<program_id>_<page_num>_<checksum>`` for a course name.

        CRC32 is used instead of the built-in ``hash()`` because string
        hashes are salted per process, which made ids differ between runs.
        """
        checksum = zlib.crc32(name.encode('utf-8')) % 10000
        return f'{program_id}_{page_num}_{checksum}'

    @staticmethod
    def _source_name(pdf_path: Optional[str]) -> str:
        """Return the file name of ``pdf_path``, or ``''`` when it is unknown."""
        return os.path.basename(pdf_path) if pdf_path else ''

    @staticmethod
    def _matching_cells(row: list, headers: list, indicators: list):
        """Yield non-empty cells whose header contains any of ``indicators``."""
        for i, header in enumerate(headers):
            if any(indicator in header for indicator in indicators):
                if i < len(row) and row[i]:
                    yield row[i]

    def _extract_int(self, row: list, headers: list, indicators: list,
                     default: int) -> int:
        """Return the first integer found in a matching column, else ``default``."""
        for cell in self._matching_cells(row, headers, indicators):
            match = re.search(r'\d+', cell)
            if match:
                return int(match.group())
        return default

    def _extract_name(self, row: list, headers: list) -> str:
        """Return the course name: a name-like column, else the first cell."""
        name_indicators = ['название', 'дисциплина', 'курс', 'предмет', 'name', 'course']

        for cell in self._matching_cells(row, headers, name_indicators):
            return cell

        if len(row) > 0 and row[0]:
            return row[0]

        return ''

    def _extract_semester(self, row: list, headers: list) -> int:
        """Return the semester number from a semester column (default 1)."""
        return self._extract_int(row, headers, ['семестр', 'semester', 'сем'], 1)

    def _extract_credits(self, row: list, headers: list) -> int:
        """Return the credit count from a credits column (default 0)."""
        return self._extract_int(row, headers, ['кредит', 'credit', 'зет', 'з.е.'], 0)

    def _extract_hours(self, row: list, headers: list) -> int:
        """Return the hour count from an hours column (default 0)."""
        return self._extract_int(row, headers, ['час', 'hour', 'ауд'], 0)

    def _extract_type(self, row: list, headers: list) -> str:
        """Return ``'required'`` or ``'elective'`` (default ``'required'``)."""
        type_indicators = ['тип', 'type', 'вид']

        for cell in self._matching_cells(row, headers, type_indicators):
            text = cell.lower()
            if any(word in text for word in ['обязательная', 'required', 'обяз']):
                return 'required'
            if any(word in text for word in ['по выбору', 'elective', 'выбор']):
                return 'elective'

        return 'required'

    def _parse_text_fallback(self, page, page_num: int, program_id: str,
                             pdf_path: Optional[str] = None) -> List[Dict]:
        """Derive courses from the page's plain text when no table matched.

        A line containing 'семестр' updates the current semester; every other
        line longer than 10 characters that is not purely digits becomes a
        course with zero credits and hours and type ``'required'``.
        """
        courses = []

        try:
            text = page.extract_text()
            if not text:
                return courses

            current_semester = 1
            source_pdf = self._source_name(pdf_path)

            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                if 'семестр' in line.lower():
                    semester_match = re.search(r'\d+', line)
                    if semester_match:
                        current_semester = int(semester_match.group())
                    continue

                if len(line) > 10 and not line.isdigit():
                    courses.append({
                        'id': self._make_course_id(program_id, page_num, line),
                        'program_id': program_id,
                        'semester': current_semester,
                        'name': line,
                        'credits': 0,
                        'hours': 0,
                        'type': 'required',
                        'source_pdf': source_pdf,
                        'source_page': page_num
                    })

        except Exception as e:
            print(f'Ошибка fallback парсинга страницы {page_num}: {e}')

        return courses

def main():
    """Fetch a sample PDF and report how many courses were parsed from it."""
    pdf_parser = PDFParser()
    sample_url, sample_name = 'https://example.com/test.pdf', 'test.pdf'

    downloaded = pdf_parser.download_pdf(sample_url, sample_name)
    if not downloaded:
        # Download failed; download_pdf has already printed the reason.
        return

    found = pdf_parser.parse_pdf(downloaded, 'test_program')
    print(f'Найдено курсов: {len(found)}')


# Run the demo only when the file is executed as a script.
if __name__ == '__main__':
    main()