import hashlib
import re
from collections import Counter
from typing import Dict, List, Optional
class DataNormalizer:
    """Clean, tag and de-duplicate course records represented as plain dicts.

    The public entry points are :meth:`normalize_courses`,
    :meth:`merge_courses`, :meth:`validate_course` and
    :meth:`get_statistics`.  ``tag_keywords`` maps a tag name to the list of
    lowercase keywords whose presence in a course's name or short
    description assigns that tag.
    """

    # Maximum number of characters kept from a description / long name.
    _SHORT_DESC_LIMIT = 220
    # Names longer than this are considered descriptive enough to double as
    # the short description when no description is supplied.
    _LONG_NAME_THRESHOLD = 50
    _DEFAULT_SHORT_DESC = 'Курс из учебного плана программы'
    # Straight and typographic double quotes removed from course names.
    _QUOTE_TABLE = str.maketrans('', '', '"\u201c\u201d')
    # Unit-separator placed between the fields of the de-duplication key so
    # that ('ab', 'c') and ('a', 'bc') cannot produce the same key.
    _HASH_FIELD_SEP = '\x1f'

    def __init__(self):
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных'],
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Normalize every course and drop duplicates, keeping first occurrences.

        Courses without a usable name are skipped.  Two courses are
        duplicates when their normalized name, ``program_id`` and normalized
        semester are all equal.
        """
        normalized_courses = []
        seen_hashes = set()
        for course in courses:
            normalized = self._normalize_course(course)
            if normalized is None:
                continue
            course_hash = self._calculate_course_hash(normalized)
            if course_hash in seen_hashes:
                continue
            seen_hashes.add(course_hash)
            normalized_courses.append(normalized)
        return normalized_courses

    def _normalize_course(self, course: Dict) -> Optional[Dict]:
        """Return a normalized shallow copy of ``course``, or ``None``.

        ``None`` is returned when the name is missing or empty after
        cleaning (e.g. whitespace only).  The input dict is not modified.
        """
        name = self._normalize_name(course.get('name'))
        if not name:
            return None
        normalized = course.copy()
        normalized['name'] = name
        # Derived fields are built from the cleaned record so that the short
        # description and the tags see the same (normalized) name.
        normalized['short_desc'] = self._generate_short_desc(normalized)
        normalized['tags'] = self._generate_tags(normalized)
        normalized['semester'] = self._normalize_semester(course.get('semester', 1))
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))
        return normalized

    def _normalize_name(self, name: str) -> str:
        """Strip double quotes, collapse whitespace runs and trim the name.

        Falsy input yields ``''``; non-string input is converted with
        ``str()``.
        """
        if not name:
            return ''
        # Quotes go first: removing them after trimming could leave stray
        # leading/trailing or doubled spaces behind.
        cleaned = str(name).translate(self._QUOTE_TABLE)
        return re.sub(r'\s+', ' ', cleaned).strip()

    def _generate_short_desc(self, course: Dict) -> str:
        """Build a short description for ``course``.

        Preference order: the stripped ``description`` (cut to 220
        characters plus ``'...'`` when longer), then a name longer than 50
        characters (cut to 220), then a fixed placeholder text.
        """
        raw_desc = course.get('description', '')
        desc = str(raw_desc).strip() if raw_desc else ''
        if desc:
            if len(desc) > self._SHORT_DESC_LIMIT:
                desc = desc[:self._SHORT_DESC_LIMIT] + '...'
            return desc
        # str() guards against non-string names (e.g. numeric codes).
        name = str(course.get('name') or '')
        if len(name) > self._LONG_NAME_THRESHOLD:
            return name[:self._SHORT_DESC_LIMIT]
        return self._DEFAULT_SHORT_DESC

    def _generate_tags(self, course: Dict) -> List[str]:
        """Return the tags whose keywords occur in the name or short description.

        Matching is case-insensitive with respect to the course text; tags
        come back in ``tag_keywords`` order.
        """
        text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
        return [
            tag
            for tag, keywords in self.tag_keywords.items()
            if any(self._keyword_in_text(keyword, text) for keyword in keywords)
        ]

    @staticmethod
    def _keyword_in_text(keyword: str, text: str) -> bool:
        """Tell whether ``keyword`` occurs in the already-lowercased ``text``.

        Short Latin abbreviations (up to three ASCII letters/digits, such as
        'ml', 'cv', 'pm') must match as a whole word, otherwise they fire
        inside unrelated words ('html', 'development').  Longer keywords
        keep plain substring matching so inflected forms still match
        ('алгоритм' in 'алгоритмы').
        """
        if re.fullmatch(r'[A-Za-z0-9]{1,3}', keyword):
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            return re.search(pattern, text) is not None
        return keyword in text

    @staticmethod
    def _bounded_int(value, default: int, minimum: int,
                     maximum: Optional[int] = None) -> int:
        """Convert ``value`` with ``int()``; fall back to ``default``.

        The default is returned when conversion fails or the result lies
        outside ``[minimum, maximum]`` (no upper bound when ``maximum`` is
        ``None``).
        """
        try:
            number = int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: int(float('inf')).
            return default
        if number < minimum:
            return default
        if maximum is not None and number > maximum:
            return default
        return number

    def _normalize_semester(self, semester) -> int:
        """Return the semester as an int in 1..4, or 1 when invalid."""
        return self._bounded_int(semester, default=1, minimum=1, maximum=4)

    def _normalize_credits(self, credits) -> int:
        """Return the credits as a non-negative int, or 0 when invalid."""
        return self._bounded_int(credits, default=0, minimum=0)

    def _normalize_hours(self, hours) -> int:
        """Return the hours as a non-negative int, or 0 when invalid."""
        return self._bounded_int(hours, default=0, minimum=0)

    def _normalize_type(self, course_type: str) -> str:
        """Map a free-form course type to ``'required'`` or ``'elective'``.

        Unknown or empty values default to ``'required'``.
        """
        if not course_type:
            return 'required'
        type_lower = str(course_type).lower()
        # Negated forms contain the "required" markers as substrings
        # ('необязательная' contains 'обязательная'), so test them first.
        if any(marker in type_lower for marker in ('необяз', 'not required')):
            return 'elective'
        if any(marker in type_lower for marker in ('обяз', 'required')):
            return 'required'
        if any(marker in type_lower for marker in ('выбор', 'elective')):
            return 'elective'
        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        """Return an MD5 hex digest identifying a course for de-duplication.

        The key is built from ``name``, ``program_id`` and ``semester``
        joined with a separator character; the digest is used only as a
        set key, not for security.
        """
        key_fields = (
            course.get('name', ''),
            course.get('program_id', ''),
            course.get('semester', ''),
        )
        text = self._HASH_FIELD_SEP.join(str(field) for field in key_fields)
        return hashlib.md5(text.encode()).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        """Flatten several course lists and normalize/de-duplicate the result."""
        all_courses = [course for courses in courses_list for course in courses]
        return self.normalize_courses(all_courses)

    def validate_course(self, course: Dict) -> bool:
        """Check that ``name``, ``program_id`` and ``semester`` are present.

        All three must be truthy, and the name must be at least three
        characters long once converted to text and stripped.
        """
        required_fields = ('name', 'program_id', 'semester')
        if not all(course.get(field) for field in required_fields):
            return False
        return len(str(course['name']).strip()) >= 3

    def get_statistics(self, courses: List[Dict]) -> Dict:
        """Count courses overall and per program, semester, type and tag.

        Missing fields fall back to ``'unknown'`` (program), ``1``
        (semester), ``'required'`` (type) and no tags.  The returned
        counters are plain dicts.
        """
        by_program = Counter()
        by_semester = Counter()
        by_type = Counter()
        by_tags = Counter()
        for course in courses:
            by_program[course.get('program_id', 'unknown')] += 1
            by_semester[course.get('semester', 1)] += 1
            by_type[course.get('type', 'required')] += 1
            # `or ()` tolerates an explicit tags=None.
            by_tags.update(course.get('tags') or ())
        return {
            'total_courses': len(courses),
            'by_program': dict(by_program),
            'by_semester': dict(by_semester),
            'by_type': dict(by_type),
            'by_tags': dict(by_tags),
        }
def main():
    """Demo run: normalize two sample courses and print a summary.

    Prints the number of courses left after normalization followed by the
    statistics dictionary computed for them.
    """
    # Sample records are laid out column-wise; each row becomes one dict
    # with the keys in `columns` order.
    columns = ('id', 'program_id', 'name', 'semester', 'credits', 'type')
    rows = (
        ('test_1', 'ai', 'Машинное обучение', 1, 6, 'required'),
        ('test_2', 'ai_product', 'Глубокое обучение', 2, 4, 'elective'),
    )
    sample_courses = [dict(zip(columns, row)) for row in rows]

    normalizer = DataNormalizer()
    cleaned = normalizer.normalize_courses(sample_courses)
    summary = normalizer.get_statistics(cleaned)

    print(f'Нормализовано курсов: {len(cleaned)}')
    print(f'Статистика: {summary}')


# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()