import os
import re
import zlib
from typing import Dict, List, Optional

import pdfplumber
import requests
from tqdm import tqdm
class PDFParser:
    """Download curriculum PDFs and extract course records from them.

    Every course is a plain dict with the keys ``id``, ``program_id``,
    ``semester``, ``name``, ``credits``, ``hours``, ``type``,
    ``source_pdf`` and ``source_page``.  Tables on a page are parsed
    first; a page that yields no courses from tables falls back to a
    line-by-line text heuristic.
    """

    # Substrings looked for in lower-cased header cells to locate a column.
    _NAME_HEADERS = ('название', 'дисциплина', 'курс', 'предмет', 'name', 'course')
    _SEMESTER_HEADERS = ('семестр', 'semester', 'сем')
    _CREDIT_HEADERS = ('кредит', 'credit', 'зет', 'з.е.')
    _HOUR_HEADERS = ('час', 'hour', 'ауд')
    _TYPE_HEADERS = ('тип', 'type', 'вид')

    # Substrings of a type cell that classify the course.
    _REQUIRED_WORDS = ('обязательная', 'required', 'обяз')
    _ELECTIVE_WORDS = ('по выбору', 'elective', 'выбор')

    _DIGITS = re.compile(r'\d+')

    def __init__(self):
        # One session for all downloads, carrying a browser-like User-Agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        """Download *url* to ``data/raw/<filename>`` unless it already exists.

        Returns:
            The local path on success (or when the file is already
            present), ``None`` when the request or the write fails.
        """
        local_path = os.path.join('data/raw', filename)
        if os.path.exists(local_path):
            print(f'PDF уже загружен: {filename}')
            return local_path

        # Stream into a sibling ".part" file and rename only when complete,
        # so an interrupted download never leaves a truncated PDF that the
        # existence check above would later accept as "already downloaded".
        partial_path = local_path + '.part'
        try:
            print(f'Загрузка PDF: {url}')
            with self.session.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, local_path)
        except (requests.RequestException, OSError) as e:
            print(f'Ошибка загрузки PDF {url}: {e}')
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                # Best-effort cleanup; the failure was already reported.
                pass
            return None

        print(f'PDF сохранен: {local_path}')
        return local_path

    def parse_pdf(self, pdf_path: str, program_id: str) -> List[Dict]:
        """Extract all courses from the PDF at *pdf_path*.

        Pages are processed in order with 1-based page numbers.  If
        opening or reading the document fails, the error is printed and
        the courses collected so far are returned.
        """
        courses: List[Dict] = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                print(f'Парсинг PDF: {pdf_path}')
                pages = tqdm(pdf.pages, desc='Страницы')
                for page_num, page in enumerate(pages, start=1):
                    courses.extend(
                        self._parse_page(page, page_num, program_id, pdf_path)
                    )
            print(f'Найдено курсов: {len(courses)}')
        except Exception as e:
            # Top-level boundary around the third-party PDF reader: report
            # the problem and hand back whatever was parsed.
            print(f'Ошибка парсинга PDF {pdf_path}: {e}')
        return courses

    def _parse_page(self, page, page_num: int, program_id: str,
                    pdf_path: Optional[str] = None) -> List[Dict]:
        """Parse one page: tables first, text heuristic if tables gave nothing."""
        courses: List[Dict] = []
        try:
            for table in page.extract_tables():
                courses.extend(
                    self._parse_table(table, page_num, program_id, pdf_path)
                )
            if not courses:
                courses = self._parse_text_fallback(
                    page, page_num, program_id, pdf_path
                )
        except Exception as e:
            print(f'Ошибка парсинга страницы {page_num}: {e}')
        return courses

    def _parse_table(self, table: list, page_num: int, program_id: str,
                     pdf_path: Optional[str] = None) -> List[Dict]:
        """Turn a table (first row = headers) into course dicts.

        Tables with fewer than two rows and data rows with fewer than
        three cells are ignored.
        """
        courses: List[Dict] = []
        if not table or len(table) < 2:
            return courses

        headers = [str(cell).lower().strip() if cell else '' for cell in table[0]]
        for row in table[1:]:
            if not row or len(row) < 3:
                continue
            course = self._extract_course_from_row(
                row, headers, page_num, program_id, pdf_path
            )
            if course:
                courses.append(course)
        return courses

    def _extract_course_from_row(self, row: list, headers: list, page_num: int,
                                 program_id: str,
                                 pdf_path: Optional[str] = None) -> Optional[Dict]:
        """Build a course dict from one table row.

        Returns ``None`` when the row has no name of at least three
        characters or when the row cannot be processed.
        """
        try:
            cells = [str(cell).strip() if cell else '' for cell in row]
            name = self._extract_name(cells, headers)
            if len(name) < 3:
                return None
            return self._make_course(
                program_id=program_id,
                page_num=page_num,
                name=name,
                semester=self._extract_semester(cells, headers),
                credits=self._extract_credits(cells, headers),
                hours=self._extract_hours(cells, headers),
                course_type=self._extract_type(cells, headers),
                pdf_path=pdf_path,
            )
        except (TypeError, ValueError, IndexError) as e:
            print(f'Ошибка извлечения курса из строки: {e}')
            return None

    @staticmethod
    def _make_course(program_id: str, page_num: int, name: str, semester: int,
                     credits: int, hours: int, course_type: str,
                     pdf_path: Optional[str]) -> Dict:
        """Assemble the course record shared by the table and text parsers."""
        # CRC32 instead of the built-in hash(): str hashes are salted per
        # process, which made ids differ from one run to the next.
        suffix = zlib.crc32(name.encode('utf-8')) % 10000
        return {
            'id': f'{program_id}_{page_num}_{suffix}',
            'program_id': program_id,
            'semester': semester,
            'name': name,
            'credits': credits,
            'hours': hours,
            'type': course_type,
            'source_pdf': os.path.basename(pdf_path) if pdf_path else '',
            'source_page': page_num,
        }

    @staticmethod
    def _matching_cells(row: list, headers: list, indicators):
        """Yield non-empty cells whose column header contains any indicator."""
        for idx, header in enumerate(headers):
            if idx >= len(row) or not row[idx] or not header:
                continue
            if any(indicator in header for indicator in indicators):
                yield row[idx]

    def _extract_int(self, row: list, headers: list, indicators, default: int) -> int:
        """Return the first integer found in a matching column, else *default*.

        A matching cell without digits is skipped and later matching
        columns are still tried.
        """
        for cell in self._matching_cells(row, headers, indicators):
            found = self._DIGITS.search(str(cell))
            if found:
                return int(found.group())
        return default

    def _extract_name(self, row: list, headers: list) -> str:
        """Return the course name: a name-like column, else the first cell."""
        for cell in self._matching_cells(row, headers, self._NAME_HEADERS):
            return cell
        if row and row[0]:
            return row[0]
        return ''

    def _extract_semester(self, row: list, headers: list) -> int:
        """Return the semester number from the row (default ``1``)."""
        return self._extract_int(row, headers, self._SEMESTER_HEADERS, 1)

    def _extract_credits(self, row: list, headers: list) -> int:
        """Return the credit count from the row (default ``0``)."""
        return self._extract_int(row, headers, self._CREDIT_HEADERS, 0)

    def _extract_hours(self, row: list, headers: list) -> int:
        """Return the hour count from the row (default ``0``)."""
        return self._extract_int(row, headers, self._HOUR_HEADERS, 0)

    def _extract_type(self, row: list, headers: list) -> str:
        """Classify the course as ``'required'`` or ``'elective'``.

        Defaults to ``'required'`` when no type column gives an answer.
        """
        for cell in self._matching_cells(row, headers, self._TYPE_HEADERS):
            text = str(cell).lower()
            if any(word in text for word in self._REQUIRED_WORDS):
                return 'required'
            if any(word in text for word in self._ELECTIVE_WORDS):
                return 'elective'
        return 'required'

    def _parse_text_fallback(self, page, page_num: int, program_id: str,
                             pdf_path: Optional[str] = None) -> List[Dict]:
        """Heuristically read courses from the page's plain text.

        A line mentioning "семестр" updates the current semester (when it
        contains a number) and is never emitted as a course.  Any other
        line longer than ten characters that is not purely digits becomes
        a course with zero credits/hours and type ``'required'``.
        """
        courses: List[Dict] = []
        try:
            text = page.extract_text()
        except Exception as e:
            print(f'Ошибка fallback парсинга страницы {page_num}: {e}')
            return courses
        if not text:
            return courses

        current_semester = 1
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            if 'семестр' in line.lower():
                found = self._DIGITS.search(line)
                if found:
                    current_semester = int(found.group())
                continue
            if len(line) > 10 and not line.isdigit():
                courses.append(self._make_course(
                    program_id=program_id,
                    page_num=page_num,
                    name=line,
                    semester=current_semester,
                    credits=0,
                    hours=0,
                    course_type='required',
                    pdf_path=pdf_path,
                ))
        return courses
def main():
    """Smoke-run: fetch a sample PDF and report how many courses it yields."""
    pdf_parser = PDFParser()
    sample_url, sample_name = 'https://example.com/test.pdf', 'test.pdf'

    downloaded = pdf_parser.download_pdf(sample_url, sample_name)
    if not downloaded:
        # Nothing to parse when the download failed.
        return

    found = pdf_parser.parse_pdf(downloaded, 'test_program')
    print(f'Найдено курсов: {len(found)}')


if __name__ == '__main__':
    main()