Spaces:

VincentCroft
/

FaultDetectionDeepLearning

Sleeping

FaultDetectionDeepLearning / fault_classification_pmu.py

GuanHuaYu student

Fix

61d758d 6 months ago

39.1 kB

	"""Fault classification training utilities for PMU and PV datasets.

	This module trains deep learning models on high-frequency PMU measurements and
	supports classical machine learning baselines so the resulting artefacts can be
	served via the Gradio app in this repository or on Hugging Face Spaces. It
	implements a full training pipeline including preprocessing, sequence
	generation, model definition (CNN-LSTM, Temporal Convolutional Network, or
	Support Vector Machine), evaluation, and export of deployment metadata.

	Example
	-------
	python fault_classification_pmu.py \
	--data-path data/Fault_Classification_PMU_Data.csv \
	--label-column FaultType \
	--model-type tcn \
	--model-out pmu_tcn_model.keras \
	--scaler-out pmu_feature_scaler.pkl \
	--metadata-out pmu_metadata.json

	The script accepts CSV input where each row contains a timestamped PMU
	measurement and a categorical fault label. Features default to the 15 PMU
	channels used in the project documentation, but any subset can be provided
	via the ``--feature-columns`` argument. Data is automatically standardised
	and windowed to create temporal sequences that feed into the neural network.

	The exported metadata JSON file contains the feature ordering, label names,
	sequence length, stride, and chosen architecture. The Gradio front-end
	consumes this file to replicate the same preprocessing steps during inference.
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import shutil
	from datetime import datetime
	from pathlib import Path
	from typing import Dict, List, Optional, Sequence, Tuple

	import math

	# These variables are set before the TensorFlow import further down.
	# ``setdefault`` only applies a value when the variable is not already set,
	# so anything exported by the caller takes precedence.
	# NOTE(review): "-1" presumably hides all CUDA devices (CPU-only run) - confirm.
	os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")
	# NOTE(review): presumably silences TensorFlow's native INFO/WARNING logs - confirm.
	os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
	# NOTE(review): presumably turns off oneDNN custom operations - confirm.
	os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")

	import joblib
	import numpy as np
	import pandas as pd
	from pandas.api.types import is_numeric_dtype
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import LabelEncoder, StandardScaler
	from sklearn.svm import SVC
	from tensorflow.keras import callbacks, layers, models, optimizers



	class ProgressCallback(callbacks.Callback):
	    """Keras callback that reports training progress to stdout and a status file.

	    Epoch boundaries and the end of training are always reported; per-batch
	    status messages are throttled to one every ``status_update_interval``
	    seconds. During the first epoch, every ``batch_log_frequency``-th batch is
	    additionally printed directly.

	    Parameters
	    ----------
	    total_epochs:
	        Number of epochs requested for the run; used to compute percentages.
	    status_file_path:
	        Optional file that is overwritten with the latest status message.
	    status_update_interval:
	        Minimum seconds between throttled status reports (clamped to >= 1.0).
	    batch_log_frequency:
	        Interval, in batches, of the first-epoch log lines (clamped to >= 1).
	    """

	    def __init__(
	        self,
	        total_epochs,
	        status_file_path=None,
	        *,
	        status_update_interval: float = 10.0,
	        batch_log_frequency: int = 10,
	    ):
	        super().__init__()
	        self.total_epochs = total_epochs
	        self.status_file_path = status_file_path
	        self.status_update_interval = max(1.0, float(status_update_interval))
	        self.batch_log_frequency = max(1, int(batch_log_frequency))
	        self.current_epoch = 0
	        self.train_start_time: Optional[float] = None
	        self.last_status_report: Optional[float] = None
	        self.total_batches_per_epoch = 0
	        self.batches_seen = 0

	    # ------------------------------------------------------------------
	    # Internal helpers
	    # ------------------------------------------------------------------
	    def _now(self) -> float:
	        """Return the current reading of the monotonic performance counter."""
	        import time

	        return time.perf_counter()

	    def _training_elapsed(self, now: Optional[float] = None) -> float:
	        """Return seconds since training began, or 0.0 if it has not started."""
	        if self.train_start_time is None:
	            return 0.0
	        if now is None:
	            now = self._now()
	        return max(0.0, now - self.train_start_time)

	    def _overall_percent(self, epochs_done: float) -> float:
	        """Express ``epochs_done`` as a percentage of ``total_epochs``.

	        Returns 0.0 when ``total_epochs`` is zero/None instead of raising
	        ``ZeroDivisionError``/``TypeError`` in the middle of training.
	        """
	        if not self.total_epochs:
	            return 0.0
	        return epochs_done / self.total_epochs * 100

	    def _report_status(self, message: str, *, force: bool = False) -> None:
	        """Print ``message`` and mirror it to the status file.

	        Unless ``force`` is true, the message is dropped when the previous
	        report happened less than ``status_update_interval`` seconds ago.
	        """
	        now = self._now()
	        throttled = (
	            not force
	            and self.last_status_report is not None
	            and now - self.last_status_report < self.status_update_interval
	        )
	        if throttled:
	            return

	        print(message, flush=True)

	        if self.status_file_path:
	            try:
	                with open(self.status_file_path, "w") as f:
	                    f.write(message)
	            except OSError:
	                # Status file is best-effort; progress still streams to stdout.
	                pass

	        self.last_status_report = now

	    # ------------------------------------------------------------------
	    # Keras callback overrides
	    # ------------------------------------------------------------------
	    def on_train_begin(self, logs=None):
	        """Reset counters and work out how many batches make up one epoch."""
	        params = self.params or {}
	        steps = params.get("steps") or params.get("steps_per_epoch")
	        if steps:
	            self.total_batches_per_epoch = int(steps)
	        else:
	            samples = params.get("samples")
	            batch_size = params.get("batch_size") or 0
	            if samples and batch_size:
	                self.total_batches_per_epoch = math.ceil(samples / batch_size)
	            else:
	                self.total_batches_per_epoch = 0

	        # Reset per-run state so a reused callback instance starts clean.
	        self.current_epoch = 0
	        self.batches_seen = 0
	        self.last_status_report = None
	        self.train_start_time = self._now()

	    def on_epoch_begin(self, epoch, logs=None):
	        """Report the start of an epoch (always, bypassing the throttle)."""
	        import time

	        now = self._now()
	        if self.train_start_time is None:
	            self.train_start_time = now

	        self.current_epoch = epoch + 1
	        self.batches_seen = 0

	        progress_pct = self._overall_percent(self.current_epoch)
	        elapsed_time = self._training_elapsed(now)
	        status_msg = (
	            f"Training epoch {self.current_epoch}/{self.total_epochs} "
	            f"({progress_pct:.1f}%) - {elapsed_time:.1f}s elapsed"
	        )
	        self._report_status(status_msg, force=True)

	        if self.current_epoch == 1:
	            wall_clock = time.strftime("%H:%M:%S")
	            print(f"Starting first epoch at {wall_clock}", flush=True)

	    def on_batch_begin(self, batch, logs=None):
	        """Log every ``batch_log_frequency``-th batch start during the first epoch."""
	        if self.current_epoch == 1 and batch % self.batch_log_frequency == 0:
	            elapsed = self._training_elapsed()
	            print(f"Epoch {self.current_epoch}, Batch {batch} started - {elapsed:.1f}s elapsed", flush=True)

	    def on_batch_end(self, batch, logs=None):
	        """Log first-epoch batch completion and emit a throttled progress report."""
	        self.batches_seen = batch + 1

	        if self.current_epoch == 1 and batch % self.batch_log_frequency == 0:
	            logs = logs or {}
	            loss = logs.get("loss", 0)
	            elapsed = self._training_elapsed()
	            print(
	                f"Epoch {self.current_epoch}, Batch {batch} completed - Loss: {loss:.4f}, {elapsed:.1f}s elapsed",
	                flush=True,
	            )

	        total_batches = self.total_batches_per_epoch or 0
	        if not total_batches:
	            # Fall back to the live params in case they were unknown at train begin.
	            params = self.params or {}
	            total_batches = (
	                params.get("steps")
	                or params.get("steps_per_epoch")
	                or 0
	            )

	        if total_batches:
	            epoch_fraction = min(1.0, (batch + 1) / total_batches)
	        else:
	            epoch_fraction = 0.0

	        overall_progress = self._overall_percent(self.current_epoch - 1 + epoch_fraction)
	        elapsed_time = self._training_elapsed()
	        status_msg = (
	            f"Epoch {self.current_epoch}/{self.total_epochs} - Batch {batch + 1}/{total_batches or '?'} "
	            f"({overall_progress:.1f}%) - {elapsed_time:.1f}s elapsed"
	        )
	        self._report_status(status_msg)

	    def on_epoch_end(self, epoch, logs=None):
	        """Report loss/accuracy for the finished epoch (always, bypassing the throttle)."""
	        logs = logs or {}
	        loss = logs.get("loss", 0)
	        val_loss = logs.get("val_loss", 0)
	        accuracy = logs.get("accuracy", logs.get("acc", 0))
	        val_accuracy = logs.get("val_accuracy", logs.get("val_acc", 0))
	        _ = epoch  # Suppress unused variable warning

	        elapsed_time = self._training_elapsed()
	        status_msg = (
	            f"Epoch {self.current_epoch}/{self.total_epochs} completed - "
	            f"Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, "
	            f"Acc: {accuracy:.4f}, Val Acc: {val_accuracy:.4f} - {elapsed_time:.1f}s total"
	        )
	        self._report_status(status_msg, force=True)

	    def on_train_end(self, logs=None):
	        """Report the number of epochs actually run and the total elapsed time."""
	        total_elapsed = self._training_elapsed()
	        # Use the last epoch that started rather than the requested total:
	        # training may stop before ``total_epochs`` (e.g. via another callback).
	        final_message = (
	            f"Training finished after {self.current_epoch} epoch(s) - "
	            f"{total_elapsed:.1f}s total elapsed"
	        )
	        self._report_status(final_message, force=True)


	# Default PMU feature set as described in the user provided table. Timestamp is
	# intentionally omitted because it is not a model input feature. The list holds
	# 15 channels: FREQ, DFDT, FLAG, and MAG/ANG pairs for L1-L3 and C1-C3.
	DEFAULT_FEATURE_COLUMNS: List[str] = [
	"[325] UPMU_SUB22:FREQ",
	"[326] UPMU_SUB22:DFDT",
	"[327] UPMU_SUB22:FLAG",
	"[328] UPMU_SUB22-L1:MAG",
	"[329] UPMU_SUB22-L1:ANG",
	"[330] UPMU_SUB22-L2:MAG",
	"[331] UPMU_SUB22-L2:ANG",
	"[332] UPMU_SUB22-L3:MAG",
	"[333] UPMU_SUB22-L3:ANG",
	"[334] UPMU_SUB22-C1:MAG",
	"[335] UPMU_SUB22-C1:ANG",
	"[336] UPMU_SUB22-C2:MAG",
	"[337] UPMU_SUB22-C2:ANG",
	"[338] UPMU_SUB22-C3:MAG",
	"[339] UPMU_SUB22-C3:ANG",
	]

	# Column names tried in this order (compared case-insensitively) when the
	# requested label column cannot be matched against the dataframe.
	LABEL_GUESS_CANDIDATES: Tuple[str, ...] = ("Fault", "FaultType", "Label", "Target", "Class")


	def _normalise_column_name(name: str) -> str:
	    """Return ``name`` as text with surrounding whitespace removed, lower-cased."""
	    text = str(name)
	    trimmed = text.strip()
	    return trimmed.lower()


	def _resolve_label_column(df: pd.DataFrame, requested: str) -> str:
	    """Pick the dataframe column that holds the labels.

	    Resolution order: exact match of ``requested``; match after stripping
	    whitespace from column names; case-insensitive match; the first entry of
	    ``LABEL_GUESS_CANDIDATES`` present (case-insensitive); finally the last
	    non-numeric column. Raises ``ValueError`` when the dataframe has no
	    columns or none of these steps yields a column.
	    """
	    column_names = [str(col) for col in df.columns]
	    if not column_names:
	        raise ValueError("Provided dataframe does not contain any columns.")

	    requested = str(requested or "").strip()
	    if requested and requested in df.columns:
	        return requested

	    by_normalised = {_normalise_column_name(col): str(col) for col in df.columns}

	    if requested:
	        # Tolerate stray whitespace around column headers first ...
	        stripped_match = next(
	            (str(col) for col in df.columns if str(col).strip() == requested),
	            None,
	        )
	        if stripped_match is not None:
	            return stripped_match
	        # ... then differences in letter case.
	        folded = requested.lower()
	        if folded in by_normalised:
	            return by_normalised[folded]

	    guessed = next(
	        (by_normalised[guess.lower()] for guess in LABEL_GUESS_CANDIDATES if guess.lower() in by_normalised),
	        None,
	    )
	    if guessed is not None:
	        return guessed

	    # Last resort: scan from the right for a non-numeric column.
	    for col in df.columns[::-1]:
	        if is_numeric_dtype(df[col]):
	            continue
	        return str(col)

	    available = ", ".join(column_names)
	    raise ValueError(
	        f"Label column '{requested or ' '}' not found in provided dataframe. "
	        f"Available columns: {available}"
	    )


	def _resolve_features(df: pd.DataFrame, feature_columns: Sequence[str] \| None, label_column: str) -> List[str]:
	if feature_columns:
	missing = [c for c in feature_columns if c not in df.columns]
	if missing:
	raise ValueError(f"Feature columns not present in CSV: {missing}")
	return list(feature_columns)

	# Prefer the documented PMU ordering when the columns exist, falling back to
	# any remaining numeric columns.
	preferred = [c for c in DEFAULT_FEATURE_COLUMNS if c in df.columns]

	excluded = {label_column, label_column.lower(), "timestamp", "Timestamp"}
	remainder = [c for c in df.columns if c not in preferred and c not in excluded]
	ordered = preferred + remainder
	if not ordered:
	raise ValueError("No feature columns detected. Specify --feature-columns explicitly.")
	return ordered


	def load_dataset(
	    csv_path: Path,
	    *,
	    feature_columns: Sequence[str] | None,
	    label_column: str,
	) -> Tuple[np.ndarray, np.ndarray, List[str], str]:
	    """Load the dataset from CSV.

	    The file is read with delimiter sniffing (``sep=None`` with the Python
	    parser engine) and then processed exactly like an in-memory dataframe.

	    Parameters
	    ----------
	    csv_path:
	        Path to the CSV file containing PMU measurements.
	    feature_columns:
	        Optional explicit ordering of feature columns.
	    label_column:
	        Name of the column containing the categorical fault label.

	    Returns
	    -------
	    features: np.ndarray
	        2-D array of shape (n_samples, n_features).
	    labels: np.ndarray
	        1-D array of label strings.
	    columns: list[str]
	        Actual feature ordering used.
	    resolved_label: str
	        The column name that supplied the labels.
	    """
	    df = pd.read_csv(csv_path, sep=None, engine="python")
	    # Share the dataframe code path so CSV and in-memory inputs cannot drift apart.
	    return load_dataset_from_dataframe(
	        df, feature_columns=feature_columns, label_column=label_column
	    )


	def load_dataset_from_dataframe(
	    df: pd.DataFrame,
	    *,
	    feature_columns: Sequence[str] | None,
	    label_column: str,
	) -> Tuple[np.ndarray, np.ndarray, List[str], str]:
	    """Load dataset arrays directly from a DataFrame.

	    Parameters
	    ----------
	    df:
	        Dataframe containing feature columns and a label column.
	    feature_columns:
	        Optional explicit ordering of feature columns.
	    label_column:
	        Requested label column; resolved via ``_resolve_label_column``.

	    Returns
	    -------
	    features: np.ndarray
	        Feature values cast to ``float32``, one row per dataframe row.
	    labels: np.ndarray
	        Label values converted to strings.
	    columns: list[str]
	        Actual feature ordering used.
	    resolved_label: str
	        The column name that supplied the labels.
	    """
	    resolved_label = _resolve_label_column(df, label_column)
	    columns = _resolve_features(df, feature_columns, resolved_label)

	    features = df[columns].astype(np.float32).values
	    labels = df[resolved_label].astype(str).values
	    return features, labels, columns, resolved_label


	def create_sequences(
	    features: np.ndarray,
	    labels: np.ndarray,
	    *,
	    sequence_length: int,
	    stride: int,
	) -> Tuple[np.ndarray, np.ndarray]:
	    """Slice the rows into fixed-length windows for sequence models.

	    Windows start every ``stride`` rows and span ``sequence_length`` rows.
	    Each window is labelled with the label of its final row, matching fault
	    detection use cases where the latest measurement defines the state.

	    Raises ``ValueError`` for a non-positive ``sequence_length`` or
	    ``stride``, for mismatched row counts, and when there are fewer rows
	    than ``sequence_length``.
	    """
	    if sequence_length <= 0:
	        raise ValueError("sequence_length must be > 0")
	    if stride <= 0:
	        raise ValueError("stride must be > 0")

	    n_rows = features.shape[0]
	    if n_rows != labels.shape[0]:
	        raise ValueError("Features and labels must contain the same number of rows")
	    if n_rows < sequence_length:
	        raise ValueError("Not enough samples to create a single sequence")

	    window_starts = range(0, n_rows - sequence_length + 1, stride)
	    windows = [features[first:first + sequence_length] for first in window_starts]
	    window_labels = [labels[first + sequence_length - 1] for first in window_starts]
	    return np.stack(windows), np.array(window_labels)


	def build_cnn_lstm(
	    input_shape: Tuple[int, int],
	    num_classes: int,
	    *,
	    conv_filters: int = 128,
	    kernel_size: int = 3,
	    lstm_units: int = 128,
	    dropout: float = 0.3,
	) -> models.Model:
	    """Build and compile the CNN-LSTM classifier.

	    Two same-padded ReLU Conv1D layers (the second with dilation rate 2),
	    each followed by batch normalisation, feed an LSTM that returns only its
	    final output; dropout is applied before and after the LSTM, and a softmax
	    Dense layer produces ``num_classes`` outputs. The model is compiled with
	    Adam (learning rate 1e-3), sparse categorical cross-entropy and accuracy.
	    """
	    conv_options = {"padding": "same", "activation": "relu"}

	    inputs = layers.Input(shape=input_shape)
	    hidden = layers.Conv1D(conv_filters, kernel_size, **conv_options)(inputs)
	    hidden = layers.BatchNormalization()(hidden)
	    hidden = layers.Conv1D(conv_filters, kernel_size, dilation_rate=2, **conv_options)(hidden)
	    hidden = layers.BatchNormalization()(hidden)
	    hidden = layers.Dropout(dropout)(hidden)
	    hidden = layers.LSTM(lstm_units, return_sequences=False)(hidden)
	    hidden = layers.Dropout(dropout)(hidden)
	    outputs = layers.Dense(num_classes, activation="softmax")(hidden)

	    network = models.Model(inputs, outputs)
	    adam = optimizers.Adam(learning_rate=1e-3)
	    network.compile(
	        optimizer=adam,
	        loss="sparse_categorical_crossentropy",
	        metrics=["accuracy"],
	    )
	    return network


	def build_tcn(
	    input_shape: Tuple[int, int],
	    num_classes: int,
	    *,
	    filters: int = 64,
	    kernel_size: int = 3,
	    dilations: Sequence[int] = (1, 2, 4, 8),
	    dropout: float = 0.2,
	) -> models.Model:
	    """Build and compile the Temporal Convolutional Network classifier.

	    One residual block is created per entry in ``dilations``: two causal ReLU
	    Conv1D layers with that dilation rate, each batch-normalised, with
	    dropout between them. The block input is added back (projected by a 1x1
	    convolution when its channel count differs from ``filters``) and passed
	    through ReLU. Global average pooling, dropout and a softmax Dense layer
	    form the head. Compiled with Adam (learning rate 1e-3), sparse
	    categorical cross-entropy and accuracy.
	    """
	    inputs = layers.Input(shape=input_shape)

	    block_output = inputs
	    for rate in dilations:
	        causal_conv = {
	            "padding": "causal",
	            "activation": "relu",
	            "dilation_rate": rate,
	        }
	        shortcut = block_output

	        branch = layers.Conv1D(filters, kernel_size, **causal_conv)(block_output)
	        branch = layers.BatchNormalization()(branch)
	        branch = layers.Dropout(dropout)(branch)
	        branch = layers.Conv1D(filters, kernel_size, **causal_conv)(branch)
	        branch = layers.BatchNormalization()(branch)

	        # Match channel counts so the residual addition is well defined.
	        if shortcut.shape[-1] != filters:
	            shortcut = layers.Conv1D(filters, 1, padding="same")(shortcut)

	        merged = layers.Add()([branch, shortcut])
	        block_output = layers.Activation("relu")(merged)

	    pooled = layers.GlobalAveragePooling1D()(block_output)
	    pooled = layers.Dropout(dropout)(pooled)
	    outputs = layers.Dense(num_classes, activation="softmax")(pooled)

	    network = models.Model(inputs, outputs)
	    adam = optimizers.Adam(learning_rate=1e-3)
	    network.compile(
	        optimizer=adam,
	        loss="sparse_categorical_crossentropy",
	        metrics=["accuracy"],
	    )
	    return network


	def _write_status(status_file: Optional[Path], message: str) -> None:
	    """Best-effort overwrite of ``status_file`` with ``message`` (no-op without a file)."""
	    if not status_file:
	        return
	    try:
	        Path(status_file).parent.mkdir(parents=True, exist_ok=True)
	        with open(status_file, "w") as handle:
	            handle.write(message)
	    except OSError as exc:
	        # A progress file must never abort training; report and carry on.
	        print(f"Warning: could not update status file {status_file}: {exc}", flush=True)


	def _keras_callbacks(
	    *,
	    epochs: int,
	    status_file: Optional[Path],
	    tb_dir: Optional[str],
	    aggressive: bool,
	) -> list:
	    """Build the progress / LR-schedule / early-stopping (and TensorBoard) callbacks."""
	    lr_patience, stop_patience = (2, 3) if aggressive else (3, 6)
	    callback_list = [
	        ProgressCallback(total_epochs=epochs, status_file_path=str(status_file) if status_file else None),
	        callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=lr_patience, min_lr=1e-5),
	        callbacks.EarlyStopping(monitor="val_loss", patience=stop_patience, restore_best_weights=True),
	    ]
	    if tb_dir is not None:
	        # Placed right after the progress callback; histograms and graph are
	        # disabled to reduce TensorBoard overhead.
	        callback_list.insert(1, callbacks.TensorBoard(log_dir=tb_dir, histogram_freq=0, write_graph=False))
	    return callback_list


	def _fit_and_predict(
	    model,
	    X_train: np.ndarray,
	    y_train: np.ndarray,
	    X_val: np.ndarray,
	    y_val: np.ndarray,
	    *,
	    name: str,
	    prediction_label: str,
	    batch_size: int,
	    epochs: int,
	    callback_list: list,
	    status_file: Optional[Path],
	) -> Tuple[np.ndarray, Dict[str, object]]:
	    """Fit a compiled Keras model and return validation predictions plus history."""
	    print(f"Starting {name} training with {len(X_train)} training samples, {len(X_val)} validation samples")
	    print(f"Batch size: {batch_size}, Epochs: {epochs}")
	    _write_status(
	        status_file,
	        f"{name} training started - {len(X_train)} train, {len(X_val)} val samples, batch_size={batch_size}",
	    )

	    history = model.fit(
	        X_train,
	        y_train,
	        validation_data=(X_val, y_val),
	        epochs=epochs,
	        batch_size=batch_size,
	        callbacks=callback_list,
	        verbose=2,
	    )

	    print(f"{name} training completed, starting prediction...")
	    _write_status(status_file, f"{name} training completed, evaluating model...")

	    predicting = f"Making {prediction_label} on {len(X_val)} validation samples..."
	    print(predicting)
	    _write_status(status_file, predicting)
	    y_pred = model.predict(X_val, verbose=0).argmax(axis=1)
	    # Capitalise only the first character: "predictions" -> "Predictions".
	    print(f"{prediction_label[:1].upper()}{prediction_label[1:]} completed")
	    return y_pred, history.history


	def train_model(
	    sequences: np.ndarray,
	    labels: np.ndarray,
	    *,
	    validation_split: float,
	    batch_size: int,
	    epochs: int,
	    model_type: str = "cnn_lstm",
	    tensorboard_log_dir: Optional[Path] = None,
	    status_file_path: Optional[Path] = None,
	) -> Tuple[object, LabelEncoder, Dict[str, object]]:
	    """Train a sequence model and return training history and validation outputs.

	    Parameters
	    ----------
	    sequences:
	        Windowed feature array; the first axis indexes sequences.
	    labels:
	        One label per sequence; encoded to integers with ``LabelEncoder``.
	    validation_split:
	        Fraction of sequences held out for validation.
	    batch_size, epochs:
	        Passed to ``model.fit`` for the neural models (unused for ``svm``).
	    model_type:
	        ``"cnn_lstm"``, ``"tcn"`` or ``"svm"`` (case-insensitive).
	    tensorboard_log_dir:
	        Directory for TensorBoard logs; ignored for ``svm``.
	    status_file_path:
	        Optional file that receives best-effort progress messages.

	    Returns
	    -------
	    model, label_encoder, metrics
	        ``metrics`` holds ``history``, ``validation`` (``y_true``, ``y_pred``,
	        ``class_names``, ``confusion_matrix``), ``model_type``,
	        ``input_shape`` and ``tensorboard_log_dir``.

	    Raises
	    ------
	    ValueError
	        If ``model_type`` is not one of the supported values.
	    """
	    model_type = model_type.lower().strip()
	    if model_type not in {"cnn_lstm", "tcn", "svm"}:
	        raise ValueError("model_type must be either 'cnn_lstm', 'tcn', or 'svm'")

	    # Handle status file for progress tracking
	    status_file = status_file_path if status_file_path else None

	    label_encoder = LabelEncoder()
	    y = label_encoder.fit_transform(labels)
	    num_classes = len(label_encoder.classes_)

	    tb_dir: Optional[str] = None
	    if model_type != "svm" and tensorboard_log_dir is not None:
	        tensorboard_log_dir.mkdir(parents=True, exist_ok=True)
	        tb_dir = str(tensorboard_log_dir.resolve())

	    # Check if we can use stratification (each class needs at least 2 samples)
	    unique_labels, label_counts = np.unique(y, return_counts=True)
	    min_samples_per_class = np.min(label_counts)

	    print(f"Label distribution: {dict(zip(unique_labels, label_counts))}")
	    print(f"Minimum samples per class: {min_samples_per_class}")
	    print(f"Total sequences: {len(sequences)}, Features per sequence: {sequences.shape[1:]}")

	    # Check for potential memory issues
	    data_size_mb = sequences.nbytes / (1024 * 1024)
	    print(f"Data size: {data_size_mb:.2f} MB")
	    if data_size_mb > 1000:  # > 1GB
	        print("Warning: Large dataset detected. Consider reducing batch size or sequence length.")

	    # Validate data ranges. This must run BEFORE the training features are
	    # derived: ``nan_to_num`` returns a new array, so sanitising afterwards
	    # would leave the NaN/Inf values in the data actually used for training.
	    if np.any(np.isnan(sequences)) or np.any(np.isinf(sequences)):
	        print("Warning: NaN or Inf values detected in sequences")
	        sequences = np.nan_to_num(sequences, nan=0.0, posinf=1e6, neginf=-1e6)

	    # The SVM consumes flat vectors; the neural models consume the windows.
	    if model_type == "svm":
	        features = sequences.reshape(sequences.shape[0], -1)
	    else:
	        features = sequences

	    # Use stratification only if each class has at least 2 samples
	    if min_samples_per_class >= 2:
	        X_train, X_val, y_train, y_val = train_test_split(
	            features, y, test_size=validation_split, stratify=y, random_state=42
	        )
	    else:
	        print(f"Warning: Some classes have only {min_samples_per_class} sample(s). Using simple random split instead of stratified split.")

	        # If validation split would result in empty validation set for some classes,
	        # reduce validation split or use a minimum number of samples
	        total_samples = len(y)
	        if validation_split * total_samples < len(unique_labels):
	            # Ensure at least one sample per class in validation if possible
	            adjusted_split = max(0.1, len(unique_labels) / total_samples)
	            adjusted_split = min(adjusted_split, 0.3)  # Cap at 30%
	            print(f"Adjusting validation split from {validation_split} to {adjusted_split}")
	            validation_split = adjusted_split

	        X_train, X_val, y_train, y_val = train_test_split(
	            features, y, test_size=validation_split, random_state=42
	        )

	    input_shape = sequences.shape[1:]
	    large_dataset = len(sequences) > 100000

	    if model_type == "cnn_lstm":
	        print("Building CNN-LSTM model...")

	        # Optimize model for large datasets
	        if large_dataset:
	            print("Using lightweight CNN-LSTM for large dataset")
	            model = build_cnn_lstm(
	                input_shape=input_shape,
	                num_classes=num_classes,
	                conv_filters=64,  # Reduce from 128
	                lstm_units=64,  # Reduce from 128
	                dropout=0.2,  # Reduce dropout
	            )
	        else:
	            model = build_cnn_lstm(input_shape=input_shape, num_classes=num_classes)
	        print(f"CNN-LSTM model built. Input shape: {input_shape}, Classes: {num_classes}")
	        print(f"Model parameters: {model.count_params():,}")

	        # Adjust callbacks for dataset size (shorter patience on large datasets)
	        callback_list = _keras_callbacks(
	            epochs=epochs, status_file=status_file, tb_dir=tb_dir, aggressive=large_dataset
	        )
	        if large_dataset:
	            print("Using aggressive callbacks for large dataset")

	        y_pred, training_history = _fit_and_predict(
	            model,
	            X_train,
	            y_train,
	            X_val,
	            y_val,
	            name="CNN-LSTM",
	            prediction_label="predictions",
	            batch_size=batch_size,
	            epochs=epochs,
	            callback_list=callback_list,
	            status_file=status_file,
	        )
	    elif model_type == "tcn":
	        print("Building TCN model...")
	        model = build_tcn(input_shape=input_shape, num_classes=num_classes)
	        print(f"TCN model built. Input shape: {input_shape}, Classes: {num_classes}")

	        callback_list = _keras_callbacks(
	            epochs=epochs, status_file=status_file, tb_dir=tb_dir, aggressive=False
	        )
	        y_pred, training_history = _fit_and_predict(
	            model,
	            X_train,
	            y_train,
	            X_val,
	            y_val,
	            name="TCN",
	            prediction_label="TCN predictions",
	            batch_size=batch_size,
	            epochs=epochs,
	            callback_list=callback_list,
	            status_file=status_file,
	        )
	    else:  # svm
	        print("Training SVM model...", flush=True)
	        _write_status(status_file, "Training SVM model...")

	        model = SVC(kernel="rbf", probability=True, class_weight="balanced")
	        model.fit(X_train, y_train)

	        print("SVM training completed. Evaluating...", flush=True)
	        _write_status(status_file, "SVM training completed. Evaluating...")

	        y_pred = model.predict(X_val)
	        training_history = {
	            "train_accuracy": float(model.score(X_train, y_train)),
	            "val_accuracy": float(accuracy_score(y_val, y_pred)),
	        }

	    # Pin the label set so the matrix always has one row/column per known
	    # class, even when a class is absent from the validation split.
	    cm = confusion_matrix(y_val, y_pred, labels=np.arange(num_classes))
	    metrics: Dict[str, object] = {
	        "history": training_history,
	        "validation": {
	            "y_true": y_val,
	            "y_pred": y_pred,
	            "class_names": label_encoder.classes_.tolist(),
	            "confusion_matrix": cm,
	        },
	        "model_type": model_type,
	        "input_shape": list(input_shape),
	        "tensorboard_log_dir": tb_dir,
	    }
	    return model, label_encoder, metrics


	def standardise_sequences(sequences: np.ndarray) -> Tuple[np.ndarray, StandardScaler]:
	    """Standard-scale each feature using statistics pooled over every timestep.

	    The array is collapsed to two dimensions (last axis kept as features),
	    a new ``StandardScaler`` is fitted and applied, and the result is
	    restored to the input shape. Returns the scaled array and the scaler.
	    """
	    original_shape = sequences.shape
	    per_timestep_rows = sequences.reshape(-1, original_shape[-1])

	    scaler = StandardScaler()
	    scaled_rows = scaler.fit_transform(per_timestep_rows)
	    return scaled_rows.reshape(original_shape), scaler


	def _json_default(value: object) -> object:
	    """``json.dumps`` fallback: convert NumPy scalars/arrays and paths to JSON-native values."""
	    if isinstance(value, np.generic):
	        return value.item()
	    if isinstance(value, np.ndarray):
	        return value.tolist()
	    if isinstance(value, Path):
	        return str(value)
	    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


	def export_artifacts(
	    *,
	    model: object,
	    scaler: StandardScaler,
	    label_encoder: LabelEncoder,
	    feature_columns: Sequence[str],
	    label_column: str,
	    sequence_length: int,
	    stride: int,
	    model_path: Path,
	    scaler_path: Path,
	    metadata_path: Path,
	    metrics: dict,
	) -> None:
	    """Persist trained assets to disk for deployment.

	    Writes three files, creating parent directories as needed: the model
	    (``joblib`` for ``svm``, ``model.save`` otherwise), the fitted scaler
	    (``joblib``), and a metadata JSON describing features, classes, windowing
	    parameters, training history, classification report and confusion matrix.

	    Parameters
	    ----------
	    model, scaler, label_encoder:
	        Trained objects to persist / describe.
	    feature_columns, label_column, sequence_length, stride:
	        Preprocessing settings recorded in the metadata.
	    model_path, scaler_path, metadata_path:
	        Output locations.
	    metrics:
	        Mapping with ``history`` and ``validation`` (``y_true``, ``y_pred``
	        and optionally ``confusion_matrix``); ``model_type``, ``input_shape``
	        and ``tensorboard_log_dir`` are read when present.
	    """
	    for target in (model_path, scaler_path, metadata_path):
	        target.parent.mkdir(parents=True, exist_ok=True)

	    model_type = str(metrics.get("model_type", "cnn_lstm"))
	    if model_type == "svm":
	        joblib.dump(model, model_path)
	    else:
	        model.save(model_path)
	    joblib.dump(scaler, scaler_path)

	    validation = metrics["validation"]
	    # Pass the full label set explicitly: without it, a class missing from the
	    # validation split makes the class count differ from ``target_names``.
	    class_indices = list(range(len(label_encoder.classes_)))
	    report_dict = classification_report(
	        validation["y_true"],
	        validation["y_pred"],
	        labels=class_indices,
	        target_names=label_encoder.classes_,
	        output_dict=True,
	    )

	    metadata = {
	        "feature_columns": list(feature_columns),
	        "label_classes": label_encoder.classes_.tolist(),
	        "label_column": label_column,
	        "sequence_length": sequence_length,
	        "stride": stride,
	        "model_path": str(model_path),
	        "scaler_path": str(scaler_path),
	        "training_history": metrics["history"],
	        "classification_report": report_dict,
	        "model_type": model_type,
	        "model_format": "joblib" if model_type == "svm" else "keras",
	        "input_shape": metrics.get("input_shape"),
	        "tensorboard_log_dir": metrics.get("tensorboard_log_dir"),
	    }
	    confusion = validation.get("confusion_matrix")
	    if confusion is None:
	        confusion = confusion_matrix(validation["y_true"], validation["y_pred"], labels=class_indices)
	    metadata["confusion_matrix"] = np.asarray(confusion).tolist()

	    # The training history may hold NumPy scalars, which the stock JSON
	    # encoder rejects; ``_json_default`` converts them.
	    metadata_path.write_text(json.dumps(metadata, indent=2, default=_json_default))


	def train_from_dataframe(
	    df: pd.DataFrame,
	    *,
	    label_column: str,
	    feature_columns: Sequence[str] | None = None,
	    sequence_length: int = 32,
	    stride: int = 4,
	    validation_split: float = 0.2,
	    batch_size: int = 128,
	    epochs: int = 50,
	    model_type: str = "cnn_lstm",
	    model_path: Path | str = "pmu_cnn_lstm_model.keras",
	    scaler_path: Path | str = "pmu_feature_scaler.pkl",
	    metadata_path: Path | str = "pmu_metadata.json",
	    enable_tensorboard: bool = True,
	    tensorboard_root: Path | str | None = None,
	) -> dict:
	    """Train a PMU fault classification model using an in-memory dataframe.

	    Runs the full pipeline: column resolution, windowing, standardisation,
	    size-dependent parameter adjustment, training, artefact export and
	    (optionally) zipping of the TensorBoard run directory. Progress is
	    written to ``training_status.txt`` next to ``model_path``.

	    Returns
	    -------
	    dict
	        Summary with sample/sequence counts, feature and class names,
	        resolved artefact paths, training history, classification report,
	        confusion matrix, TensorBoard locations and the resolved label column.

	    Raises
	    ------
	    ValueError
	        If fewer than 10 sequences can be generated (or any pipeline step
	        rejects its input).
	    """
	    from datetime import timezone

	    model_path = Path(model_path)
	    scaler_path = Path(scaler_path)
	    metadata_path = Path(metadata_path)
	    # Normalise once so every size/model check below agrees with train_model.
	    normalised_model_type = model_type.lower().strip()

	    # Create status file for progress tracking. The directory is created up
	    # front because status messages are written before artefacts are exported.
	    status_file = model_path.parent / "training_status.txt"
	    status_file.parent.mkdir(parents=True, exist_ok=True)
	    print(f"Training progress will be written to: {status_file}")

	    tensorboard_log_dir: Optional[Path] = None
	    if enable_tensorboard and normalised_model_type != "svm":
	        base_dir = Path(tensorboard_root) if tensorboard_root is not None else Path("tensorboard_runs")
	        # Timezone-aware replacement for the deprecated ``datetime.utcnow()``.
	        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
	        tensorboard_log_dir = base_dir / f"run-{timestamp}"

	    features, labels, used_columns, resolved_label = load_dataset_from_dataframe(
	        df, feature_columns=feature_columns, label_column=label_column
	    )

	    print(f"Input data: {len(features)} samples")
	    print(f"Creating sequences with length={sequence_length}, stride={stride}")

	    sequences, seq_labels = create_sequences(
	        features,
	        labels,
	        sequence_length=sequence_length,
	        stride=stride,
	    )

	    print(f"Generated {len(sequences)} sequences")

	    # Validate sequence count and adjust parameters if necessary
	    if len(sequences) < 10:
	        raise ValueError(
	            f"Only {len(sequences)} sequences generated. Need at least 10 for training. "
	            f"Try reducing sequence_length (currently {sequence_length}) or stride (currently {stride}), "
	            "or provide more data."
	        )

	    # If very few sequences, recommend SVM instead of deep learning
	    if len(sequences) < 100 and normalised_model_type in {"cnn_lstm", "tcn"}:
	        print(f"Warning: Only {len(sequences)} sequences available. Consider using SVM for small datasets.")

	    sequences, scaler = standardise_sequences(sequences)

	    # Adjust training parameters based on data size
	    original_batch_size = batch_size
	    original_epochs = epochs
	    original_validation_split = validation_split

	    # Handle large datasets (>100K sequences) - optimize for memory and speed
	    if len(sequences) > 100000:
	        print(f"Large dataset detected ({len(sequences)} sequences). Optimizing parameters...")
	        batch_size = min(batch_size * 2, 512)  # Increase batch size for efficiency
	        epochs = min(epochs, 30)  # Reduce epochs for large datasets
	        print("Adjusted parameters for large dataset:")
	        print(f" Batch size: {original_batch_size} -> {batch_size}")
	        print(f" Epochs: {original_epochs} -> {epochs}")

	        # Force garbage collection
	        import gc
	        gc.collect()

	    elif len(sequences) < 100:
	        # For very small datasets
	        batch_size = max(min(batch_size, len(sequences) // 4), 4)  # Ensure batch_size >= 4
	        epochs = min(epochs, 20)  # Reduce epochs to prevent overfitting
	        validation_split = min(validation_split, 0.3)  # Cap the validation split at 30%
	        print("Adjusted parameters for small dataset:")
	        print(f" Batch size: {original_batch_size} -> {batch_size}")
	        print(f" Epochs: {original_epochs} -> {epochs}")
	        print(f" Validation split: {original_validation_split} -> {validation_split}")

	    model, label_encoder, metrics = train_model(
	        sequences,
	        seq_labels,
	        validation_split=validation_split,
	        batch_size=batch_size,
	        epochs=epochs,
	        model_type=model_type,
	        tensorboard_log_dir=tensorboard_log_dir,
	        status_file_path=status_file,
	    )

	    export_artifacts(
	        model=model,
	        scaler=scaler,
	        label_encoder=label_encoder,
	        feature_columns=used_columns,
	        label_column=resolved_label,
	        sequence_length=sequence_length,
	        stride=stride,
	        model_path=model_path,
	        scaler_path=scaler_path,
	        metadata_path=metadata_path,
	        metrics=metrics,
	    )

	    tensorboard_zip_path: Optional[str] = None
	    if tensorboard_log_dir and tensorboard_log_dir.exists():
	        try:
	            archive = shutil.make_archive(
	                base_name=str(tensorboard_log_dir.parent / tensorboard_log_dir.name),
	                format="zip",
	                root_dir=str(tensorboard_log_dir.parent),
	                base_dir=tensorboard_log_dir.name,
	            )
	            tensorboard_zip_path = str(Path(archive).resolve())
	        except (OSError, ValueError) as exc:
	            # Zipping the logs is a convenience; training results stay valid.
	            print(f"Warning: could not archive TensorBoard logs: {exc}")
	            tensorboard_zip_path = None

	    validation = metrics["validation"]
	    class_names = validation["class_names"]
	    # Pin the label set so a class missing from the validation split does not
	    # make the report disagree with ``class_names``.
	    class_indices = list(range(len(class_names)))
	    report_dict = classification_report(
	        validation["y_true"],
	        validation["y_pred"],
	        labels=class_indices,
	        target_names=class_names,
	        output_dict=True,
	    )
	    confusion = validation.get("confusion_matrix")
	    if confusion is None:
	        confusion = confusion_matrix(validation["y_true"], validation["y_pred"], labels=class_indices)

	    return {
	        "num_samples": int(df.shape[0]),
	        "num_sequences": int(sequences.shape[0]),
	        "feature_columns": used_columns,
	        "class_names": label_encoder.classes_.tolist(),
	        "model_path": str(model_path.resolve()),
	        "scaler_path": str(scaler_path.resolve()),
	        "metadata_path": str(metadata_path.resolve()),
	        "history": metrics["history"],
	        "model_type": metrics.get("model_type", model_type),
	        "classification_report": report_dict,
	        "confusion_matrix": np.asarray(confusion).tolist(),
	        "tensorboard_log_dir": metrics.get("tensorboard_log_dir"),
	        "tensorboard_zip_path": tensorboard_zip_path,
	        "label_column": resolved_label,
	    }


	def run_training(args: argparse.Namespace) -> None:
	    """Run the command-line training pipeline described by ``args``.

	    Loads the CSV, builds and standardises the sequences, trains the chosen
	    model, exports the artefacts and prints a validation report. No status
	    file is written in CLI mode.
	    """
	    from datetime import timezone

	    csv_path = Path(args.data_path)
	    model_out = Path(args.model_out)
	    scaler_out = Path(args.scaler_out)
	    metadata_out = Path(args.metadata_out)

	    features, labels, feature_columns, resolved_label = load_dataset(
	        csv_path, feature_columns=args.feature_columns, label_column=args.label_column
	    )

	    sequences, seq_labels = create_sequences(
	        features,
	        labels,
	        sequence_length=args.sequence_length,
	        stride=args.stride,
	    )

	    sequences, scaler = standardise_sequences(sequences)

	    tensorboard_log_dir: Optional[Path] = None
	    if args.tensorboard and args.model_type != "svm":
	        if args.tensorboard_log_dir:
	            tensorboard_log_dir = Path(args.tensorboard_log_dir)
	        else:
	            # Timezone-aware replacement for the deprecated ``datetime.utcnow()``.
	            run_stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
	            tensorboard_log_dir = Path("tensorboard_runs") / run_stamp

	    model, label_encoder, metrics = train_model(
	        sequences,
	        seq_labels,
	        validation_split=args.validation_split,
	        batch_size=args.batch_size,
	        epochs=args.epochs,
	        model_type=args.model_type,
	        tensorboard_log_dir=tensorboard_log_dir,
	        status_file_path=None,  # No status file for CLI usage
	    )

	    export_artifacts(
	        model=model,
	        scaler=scaler,
	        label_encoder=label_encoder,
	        feature_columns=feature_columns,
	        label_column=resolved_label,
	        sequence_length=args.sequence_length,
	        stride=args.stride,
	        model_path=model_out,
	        scaler_path=scaler_out,
	        metadata_path=metadata_out,
	        metrics=metrics,
	    )

	    print("Training complete")
	    print(f"Model architecture : {args.model_type}")
	    print(f"Model saved to : {model_out}")
	    print(f"Scaler saved to : {scaler_out}")
	    print(f"Metadata saved to : {metadata_out}")
	    print("Validation metrics:")
	    validation = metrics["validation"]
	    class_names = validation["class_names"]
	    # Pin the label set so the report still works when a class is missing
	    # from the validation split.
	    report = classification_report(
	        validation["y_true"],
	        validation["y_pred"],
	        labels=list(range(len(class_names))),
	        target_names=class_names,
	    )
	    print(report)
	    if metrics.get("tensorboard_log_dir"):
	        tb_dir = metrics["tensorboard_log_dir"]
	        print(f"TensorBoard logs written to: {tb_dir}")
	        print(f"Launch TensorBoard with: tensorboard --logdir \"{tb_dir}\"")


	def parse_args(argv: Sequence[str] \| None = None) -> argparse.Namespace:
	parser = argparse.ArgumentParser(description="Train a sequence model for PMU fault classification")
	parser.add_argument("--data-path", required=True, help="Path to Fault_Classification_PMU_Data CSV")
	parser.add_argument(
	"--label-column",
	default="Fault",
	help="Name of the target label column (default: Fault)",
	)
	parser.add_argument(
	"--feature-columns",
	nargs="*",
	default=None,
	help="Optional explicit list of feature columns. Defaults to all non-label columns",
	)
	parser.add_argument("--sequence-length", type=int, default=32, help="Number of timesteps per training window")
	parser.add_argument("--stride", type=int, default=4, help="Step size between consecutive windows")
	parser.add_argument("--validation-split", type=float, default=0.2, help="Validation set fraction")
	parser.add_argument("--batch-size", type=int, default=128, help="Training batch size")
	parser.add_argument("--epochs", type=int, default=50, help="Maximum number of training epochs")
	parser.add_argument(
	"--model-type",
	choices=["cnn_lstm", "tcn", "svm"],
	default="cnn_lstm",
	help="Model architecture to train (choices: cnn_lstm, tcn, svm)",
	)
	parser.add_argument("--model-out", default="pmu_cnn_lstm_model.keras", help="Path to save trained Keras model")
	parser.add_argument("--scaler-out", default="pmu_feature_scaler.pkl", help="Path to save fitted StandardScaler")
	parser.add_argument("--metadata-out", default="pmu_metadata.json", help="Path to save metadata JSON")
	parser.add_argument(
	"--tensorboard-log-dir",
	default=None,
	help="Optional directory to write TensorBoard logs (defaults to tensorboard_runs/<timestamp>)",
	)
	parser.add_argument(
	"--no-tensorboard",
	dest="tensorboard",
	action="store_false",
	help="Disable TensorBoard logging for neural network models",
	)
	parser.set_defaults(tensorboard=True)
	return parser.parse_args(argv)


	def main(argv: Sequence[str] | None = None) -> None:
	    """Script entry point: parse ``argv`` (or ``sys.argv`` when ``None``) and train."""
	    args = parse_args(argv)
	    run_training(args)


	# Run the CLI only when the file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()