# Source: ITMO-QA / scraper/normalize.py
# Uploaded by vydrking ("Upload 25 files", commit c204272, verified)
import hashlib
import re
from typing import Dict, List, Optional
class DataNormalizer:
    """Clean, tag and de-duplicate scraped course records.

    Every course is a plain ``dict``. Normalization returns a shallow copy
    with a cleaned ``name``, a generated ``short_desc``, keyword-derived
    ``tags`` and sanitized ``semester`` / ``credits`` / ``hours`` / ``type``
    values. All other keys are carried over untouched.
    """

    # Fallback description for courses with no usable description or long name.
    _DEFAULT_SHORT_DESC = 'Курс из учебного плана программы'
    # Maximum number of characters kept from a description or a name.
    _SHORT_DESC_LIMIT = 220
    # Names longer than this are reused as the short description.
    _LONG_NAME_THRESHOLD = 50
    # Separator placed between the fields of the de-duplication key so that
    # ('AB', 'C') and ('A', 'BC') can never produce the same key.
    _HASH_FIELD_SEPARATOR = '\x1f'

    def __init__(self):
        """Set up the tag -> keyword table used by ``_generate_tags``."""
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных'],
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Normalize every course and drop unusable or duplicate entries.

        Courses without a usable name are skipped. Two courses are treated
        as duplicates when their normalized name, ``program_id`` and
        semester coincide; the first occurrence wins.

        Args:
            courses: Raw course dictionaries.

        Returns:
            Normalized courses in their original relative order.
        """
        unique_courses = []
        seen_hashes = set()
        for course in courses:
            normalized = self._normalize_course(course)
            if normalized is None:
                continue
            course_hash = self._calculate_course_hash(normalized)
            if course_hash in seen_hashes:
                continue
            seen_hashes.add(course_hash)
            unique_courses.append(normalized)
        return unique_courses

    def _normalize_course(self, course: Dict) -> Optional[Dict]:
        """Return a normalized shallow copy of ``course``.

        Returns ``None`` when the name is missing or empty after cleaning
        (for example a whitespace-only or quotes-only name).
        """
        # Validate the *cleaned* name so blank names never reach the output.
        name = self._normalize_name(course.get('name'))
        if not name:
            return None

        normalized = course.copy()
        normalized['name'] = name
        normalized['short_desc'] = self._generate_short_desc(course)
        # Tags are derived from the cleaned name and the generated short_desc.
        normalized['tags'] = self._generate_tags(normalized)
        normalized['semester'] = self._normalize_semester(course.get('semester', 1))
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))
        return normalized

    def _normalize_name(self, name: str) -> str:
        """Remove double quotes, collapse whitespace runs and trim ``name``.

        Falsy input yields an empty string.
        """
        if not name:
            return ''
        # Drop quotes first: stripping before quote removal would leave the
        # padding of inputs such as '" ML "' in the result.
        unquoted = str(name).replace('"', '')
        return re.sub(r'\s+', ' ', unquoted).strip()

    def _generate_short_desc(self, course: dict) -> str:
        """Build a short description for ``course``.

        Preference order: the stripped ``description`` (cut to 220 characters
        plus ``'...'`` when longer), then a name longer than 50 characters
        (cut to 220 characters), then a generic fallback sentence.
        """
        raw_desc = course.get('description', '')
        desc = str(raw_desc).strip() if raw_desc else ''
        if desc:
            if len(desc) > self._SHORT_DESC_LIMIT:
                return desc[:self._SHORT_DESC_LIMIT] + '...'
            return desc

        raw_name = course.get('name', '')
        # Coerce to str so a non-string name cannot break len().
        name = str(raw_name) if raw_name else ''
        if len(name) > self._LONG_NAME_THRESHOLD:
            return name[:self._SHORT_DESC_LIMIT]
        return self._DEFAULT_SHORT_DESC

    def _generate_tags(self, course: Dict) -> List[str]:
        """Return the tags whose keywords occur in the name or short_desc.

        Matching is a case-insensitive substring test, so word stems match
        inflected forms.
        """
        # NOTE(review): substring matching also lets short abbreviations such
        # as 'ml' or 'pm' match inside unrelated words (e.g. 'html'); whole-word
        # matching would in turn miss 'MLOps' / 'AutoML'. Left as is.
        text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
        return [
            tag
            for tag, keywords in self.tag_keywords.items()
            if any(keyword in text for keyword in keywords)
        ]

    @staticmethod
    def _coerce_int(value, default: int, minimum: int = 0, maximum: Optional[int] = None) -> int:
        """Convert ``value`` to an int inside ``[minimum, maximum]``.

        Returns ``default`` when the conversion fails or the result falls
        outside the range.
        """
        try:
            number = int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers int(float('inf')).
            return default
        if number < minimum:
            return default
        if maximum is not None and number > maximum:
            return default
        return number

    def _normalize_semester(self, semester) -> int:
        """Return ``semester`` as an int in 1..4, defaulting to 1."""
        return self._coerce_int(semester, default=1, minimum=1, maximum=4)

    def _normalize_credits(self, credits) -> int:
        """Return ``credits`` as a non-negative int, defaulting to 0."""
        return self._coerce_int(credits, default=0)

    def _normalize_hours(self, hours) -> int:
        """Return ``hours`` as a non-negative int, defaulting to 0."""
        return self._coerce_int(hours, default=0)

    def _normalize_type(self, course_type: str) -> str:
        """Map a free-form course type to ``'required'`` or ``'elective'``.

        Required markers are checked first; anything unrecognized (or empty)
        is treated as ``'required'``.
        """
        if not course_type:
            return 'required'
        type_lower = str(course_type).lower()
        if any(word in type_lower for word in ('обязательная', 'required', 'обяз')):
            return 'required'
        if any(word in type_lower for word in ('по выбору', 'elective', 'выбор')):
            return 'elective'
        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        """Return an MD5 hex digest identifying ``course`` for de-duplication.

        The key is built from ``name``, ``program_id`` and ``semester``. The
        digest is used only as a de-duplication key, not for security.
        """
        key = self._HASH_FIELD_SEPARATOR.join(
            str(course.get(field, ''))
            for field in ('name', 'program_id', 'semester')
        )
        return hashlib.md5(key.encode('utf-8')).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        """Concatenate several course lists, then normalize and de-duplicate."""
        combined = [course for courses in courses_list for course in courses]
        return self.normalize_courses(combined)

    def validate_course(self, course: Dict) -> bool:
        """Check that ``course`` has the minimum data needed downstream.

        A valid course has truthy ``name``, ``program_id`` and ``semester``
        values and a name of at least three characters.
        """
        if any(not course.get(field) for field in ('name', 'program_id', 'semester')):
            return False
        return len(str(course['name'])) >= 3

    def get_statistics(self, courses: List[Dict]) -> Dict:
        """Count courses overall and per program, semester, type and tag.

        Missing keys fall back to ``'unknown'`` (program), ``1`` (semester),
        ``'required'`` (type) and no tags.

        Returns:
            A dict with ``total_courses`` plus the ``by_program``,
            ``by_semester``, ``by_type`` and ``by_tags`` counters.
        """
        stats = {
            'total_courses': len(courses),
            'by_program': {},
            'by_semester': {},
            'by_type': {},
            'by_tags': {},
        }

        def bump(bucket: str, key) -> None:
            counter = stats[bucket]
            counter[key] = counter.get(key, 0) + 1

        for course in courses:
            bump('by_program', course.get('program_id', 'unknown'))
            bump('by_semester', course.get('semester', 1))
            bump('by_type', course.get('type', 'required'))
            # `or []` tolerates an explicit tags=None.
            for tag in course.get('tags') or []:
                bump('by_tags', tag)
        return stats
def main():
    """Run a small smoke demo: normalize two sample courses and print stats."""
    sample_courses = [
        dict(
            id='test_1',
            program_id='ai',
            name='Машинное обучение',
            semester=1,
            credits=6,
            type='required',
        ),
        dict(
            id='test_2',
            program_id='ai_product',
            name='Глубокое обучение',
            semester=2,
            credits=4,
            type='elective',
        ),
    ]

    pipeline = DataNormalizer()
    cleaned = pipeline.normalize_courses(sample_courses)
    summary = pipeline.get_statistics(cleaned)

    # Report how many courses survived normalization, then the breakdown.
    print(f'Нормализовано курсов: {len(cleaned)}')
    print(f'Статистика: {summary}')
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()