File size: 8,891 Bytes
c204272 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 |
import os
import re
import zlib
from typing import Dict, List, Optional

import pdfplumber
import requests
from tqdm import tqdm
class PDFParser:
    """Download curriculum PDFs and extract course records from them.

    Every course is returned as a plain dict with the keys ``id``,
    ``program_id``, ``semester``, ``name``, ``credits``, ``hours``, ``type``,
    ``source_pdf`` and ``source_page``.
    """

    # Lower-case substrings looked for in table headers to locate each column.
    _NAME_INDICATORS = ('название', 'дисциплина', 'курс', 'предмет', 'name', 'course')
    _SEMESTER_INDICATORS = ('семестр', 'semester', 'сем')
    _CREDIT_INDICATORS = ('кредит', 'credit', 'зет', 'з.е.')
    _HOUR_INDICATORS = ('час', 'hour', 'ауд')
    _TYPE_INDICATORS = ('тип', 'type', 'вид')

    # Lower-case substrings that classify the content of a "type" cell.
    _REQUIRED_WORDS = ('обязательная', 'required', 'обяз')
    _ELECTIVE_WORDS = ('по выбору', 'elective', 'выбор')

    # First run of digits in a cell / line; compiled once for all extractors.
    _FIRST_INT = re.compile(r'\d+')

    def __init__(self, raw_dir: str = 'data/raw'):
        """Create a parser with its own HTTP session.

        Args:
            raw_dir: Directory in which downloaded PDFs are stored.
        """
        self.raw_dir = raw_dir
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        """Download ``url`` into the raw-data directory unless already present.

        Args:
            url: Address of the PDF.
            filename: File name to store the PDF under inside ``raw_dir``.

        Returns:
            The local path of the PDF, or ``None`` when the download failed.
        """
        local_path = os.path.join(self.raw_dir, filename)
        if os.path.exists(local_path):
            print(f'PDF уже загружен: {filename}')
            return local_path

        # Stream into a sibling temp file and rename only on success, so an
        # interrupted download is never mistaken for a cached PDF next time.
        partial_path = local_path + '.part'
        try:
            print(f'Загрузка PDF: {url}')
            # The context manager releases the streamed connection.
            with self.session.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                os.makedirs(self.raw_dir, exist_ok=True)
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, local_path)
            print(f'PDF сохранен: {local_path}')
            return local_path
        except (requests.RequestException, OSError) as e:
            print(f'Ошибка загрузки PDF {url}: {e}')
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written, or the leftover cannot be removed;
                # either way the final path was never created.
                pass
            return None

    def parse_pdf(self, pdf_path: str, program_id: str) -> List[Dict]:
        """Extract courses from every page of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path of the PDF file to open.
            program_id: Identifier stored in every course and used as the
                prefix of the course ``id``.

        Returns:
            The courses found. If opening or parsing fails, the error is
            printed and the courses collected so far are returned.
        """
        courses: List[Dict] = []
        source_pdf = os.path.basename(pdf_path)
        try:
            with pdfplumber.open(pdf_path) as pdf:
                print(f'Парсинг PDF: {pdf_path}')
                pages = tqdm(pdf.pages, desc='Страницы')
                for page_num, page in enumerate(pages, start=1):
                    courses.extend(
                        self._parse_page(page, page_num, program_id, source_pdf)
                    )
                print(f'Найдено курсов: {len(courses)}')
        except Exception as e:
            # Top-level boundary: report the failure, keep what was parsed.
            print(f'Ошибка парсинга PDF {pdf_path}: {e}')
        return courses

    def _parse_page(self, page, page_num: int, program_id: str,
                    source_pdf: str = '') -> List[Dict]:
        """Extract courses from one page, preferring tables over raw text.

        Args:
            page: Page object providing ``extract_tables()`` and
                ``extract_text()``.
            page_num: 1-based page number recorded in each course.
            program_id: Program identifier recorded in each course.
            source_pdf: File name recorded in each course's ``source_pdf``.

        Returns:
            Courses from the page's tables, or from the text fallback when
            the tables yield none.
        """
        courses: List[Dict] = []
        try:
            for table in page.extract_tables() or []:
                courses.extend(
                    self._parse_table(table, page_num, program_id, source_pdf)
                )
            if not courses:
                courses = self._parse_text_fallback(
                    page, page_num, program_id, source_pdf
                )
        except Exception as e:
            print(f'Ошибка парсинга страницы {page_num}: {e}')
        return courses

    def _parse_table(self, table: list, page_num: int, program_id: str,
                     source_pdf: str = '') -> List[Dict]:
        """Convert one extracted table into course dicts.

        The first row is treated as the header. Data rows with fewer than
        three cells, and rows without a usable name, are skipped.
        """
        if not table or len(table) < 2:
            return []

        headers = [str(cell).lower().strip() if cell else '' for cell in table[0]]
        courses = []
        for row in table[1:]:
            if not row or len(row) < 3:
                continue
            course = self._extract_course_from_row(
                row, headers, page_num, program_id, source_pdf
            )
            if course:
                courses.append(course)
        return courses

    def _extract_course_from_row(self, row: list, headers: list, page_num: int,
                                 program_id: str,
                                 source_pdf: str = '') -> Optional[Dict]:
        """Build a course dict from one table row.

        Args:
            row: Raw cells of the row (``None`` cells become empty strings).
            headers: Normalised (lower-case, stripped) header cells.
            page_num: Page number recorded in the course.
            program_id: Program identifier recorded in the course.
            source_pdf: File name recorded in ``source_pdf``.

        Returns:
            The course dict, or ``None`` when the row has no name of at least
            three characters or extraction raised.
        """
        try:
            cells = [str(cell).strip() if cell else '' for cell in row]
            name = self._extract_name(cells, headers)
            if not name or len(name) < 3:
                return None
            return {
                'id': self._make_course_id(program_id, page_num, name),
                'program_id': program_id,
                'semester': self._extract_semester(cells, headers),
                'name': name,
                'credits': self._extract_credits(cells, headers),
                'hours': self._extract_hours(cells, headers),
                'type': self._extract_type(cells, headers),
                'source_pdf': os.path.basename(source_pdf),
                'source_page': page_num
            }
        except Exception as e:
            # Best-effort: one malformed row must not abort the whole table.
            print(f'Ошибка извлечения курса из строки: {e}')
            return None

    @staticmethod
    def _make_course_id(program_id: str, page_num: int, name: str) -> str:
        """Return ``<program_id>_<page_num>_<0..9999>`` for a course name.

        CRC32 is used instead of the built-in ``hash()`` because string
        hashes are randomised per process, which made ids differ between runs.
        """
        suffix = zlib.crc32(name.encode('utf-8', 'replace')) % 10000
        return f'{program_id}_{page_num}_{suffix}'

    def _extract_int(self, row: list, headers: list, indicators, default: int) -> int:
        """Return the first integer found in a column whose header matches.

        Columns are scanned left to right. A matching column whose cell is
        empty or holds no digits is skipped in favour of later matches;
        ``default`` is returned when none produces a number.
        """
        for i, header in enumerate(headers):
            if i >= len(row) or not row[i]:
                continue
            if not any(indicator in header for indicator in indicators):
                continue
            match = self._FIRST_INT.search(str(row[i]))
            if match:
                return int(match.group())
        return default

    def _extract_name(self, row: list, headers: list) -> str:
        """Return the course name: the first non-empty cell under a name-like
        header, else the first cell of the row, else an empty string."""
        for i, header in enumerate(headers):
            if any(indicator in header for indicator in self._NAME_INDICATORS):
                if i < len(row) and row[i]:
                    return row[i]
        if row and row[0]:
            return row[0]
        return ''

    def _extract_semester(self, row: list, headers: list) -> int:
        """Return the semester number from the row, defaulting to 1."""
        return self._extract_int(row, headers, self._SEMESTER_INDICATORS, 1)

    def _extract_credits(self, row: list, headers: list) -> int:
        """Return the credit count from the row, defaulting to 0."""
        return self._extract_int(row, headers, self._CREDIT_INDICATORS, 0)

    def _extract_hours(self, row: list, headers: list) -> int:
        """Return the hour count from the row, defaulting to 0."""
        return self._extract_int(row, headers, self._HOUR_INDICATORS, 0)

    def _extract_type(self, row: list, headers: list) -> str:
        """Return ``'required'`` or ``'elective'`` based on a type-like column.

        Defaults to ``'required'`` when no such column exists or its text
        matches neither set of keywords.
        """
        for i, header in enumerate(headers):
            if not any(indicator in header for indicator in self._TYPE_INDICATORS):
                continue
            if i < len(row) and row[i]:
                text = row[i].lower()
                if any(word in text for word in self._REQUIRED_WORDS):
                    return 'required'
                if any(word in text for word in self._ELECTIVE_WORDS):
                    return 'elective'
        return 'required'

    def _parse_text_fallback(self, page, page_num: int, program_id: str,
                             source_pdf: str = '') -> List[Dict]:
        """Derive courses from the page's plain text when tables yield none.

        Lines containing 'семестр' are treated as headings: they update the
        current semester (when they contain a number) and are not emitted.
        Every other line longer than 10 characters that is not purely digits
        becomes a course with zero credits/hours and type ``'required'``.

        Args:
            page: Page object providing ``extract_text()``.
            page_num: Page number recorded in each course.
            program_id: Program identifier recorded in each course.
            source_pdf: File name recorded in each course's ``source_pdf``.
        """
        courses: List[Dict] = []
        try:
            text = page.extract_text()
            if not text:
                return courses

            current_semester = 1
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue
                if 'семестр' in line.lower():
                    match = self._FIRST_INT.search(line)
                    if match:
                        current_semester = int(match.group())
                    continue
                if len(line) > 10 and not line.isdigit():
                    courses.append({
                        'id': self._make_course_id(program_id, page_num, line),
                        'program_id': program_id,
                        'semester': current_semester,
                        'name': line,
                        'credits': 0,
                        'hours': 0,
                        'type': 'required',
                        'source_pdf': os.path.basename(source_pdf),
                        'source_page': page_num
                    })
        except Exception as e:
            print(f'Ошибка fallback парсинга страницы {page_num}: {e}')
        return courses
def main():
    """Run a demo: download a test PDF, parse it and print the course count."""
    pdf_parser = PDFParser()
    downloaded = pdf_parser.download_pdf('https://example.com/test.pdf', 'test.pdf')
    # Nothing to parse when the download did not produce a local file.
    if not downloaded:
        return
    found = pdf_parser.parse_pdf(downloaded, 'test_program')
    print(f'Найдено курсов: {len(found)}')
# Script entry point: run the demo only on direct execution, not on import.
if __name__ == '__main__':
    main()
|