
Data Exploration

This tutorial provides comprehensive guidance for exploring and visualizing Enertel forecast data. Use this as your first step when working with a data extract to understand the structure, quality, and characteristics of your forecast dataset.

Prerequisites

Before diving into analysis, review our core concepts to understand Enertel-specific terminology used throughout this tutorial.

Sample Data

We provide sample forecast data for your specific price nodes as part of our free trial. Contact us to request your customized dataset.

Setup and Imports

Start by importing the necessary libraries for data manipulation, visualization, and time handling:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set display options for better DataFrame viewing
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

Loading and Initial Inspection

Load Your Data

# Load the CSV file provided by Enertel
df = pd.read_csv('data_sample_from_enertel.csv')

print(f"Dataset loaded successfully!")
print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Display basic information about the dataset
df.info()

Data Structure Overview

print("=== Dataset Overview ===")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Forecast vintages: {df['scheduled_at'].min()} to {df['scheduled_at'].max()}")
print(f"Price nodes: {df['object_name'].nunique()}")
print(f"Data series: {df['series_name'].nunique()}")
print(f"Models: {df['model_id'].nunique()}")
print(f"Scenarios: {df['scenario_id'].nunique()}")

# Show sample of the data
print("\n=== Sample Data ===")
print(df.head())

Data Quality Assessment

Check for Duplicate Records

The composite key for forecast data should be unique across timestamp, feature, scenario, and model:

# Define the composite key columns
key_columns = ['timestamp', 'feature_id', 'scenario_id', 'model_id']

# Check for duplicates
duplicates = df.duplicated(subset=key_columns)
duplicate_count = duplicates.sum()

if duplicate_count == 0:
    print("✓ No duplicate forecasts found")
else:
    print(f"⚠ Found {duplicate_count} duplicate forecasts")
    print("Sample duplicates:")
    print(df[duplicates][key_columns + ['object_name', 'p50']].head())

# Show the distribution of composite key sizes
key_sizes = df.groupby(key_columns).size()
print(f"\nComposite key statistics:")
print(f"Unique combinations: {len(key_sizes):,}")
print(f"Max occurrences per key: {key_sizes.max()}")

Validate Timestamp Ranges

# Convert timestamps to datetime for analysis
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['scheduled_at'] = pd.to_datetime(df['scheduled_at'])

# Check timestamp validity
print("=== Timestamp Analysis ===")
print(f"Forecast timestamps:")
print(f" Range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f" Span: {(df['timestamp'].max() - df['timestamp'].min()).days} days")

print(f"\nVintage timestamps:")
print(f" Range: {df['scheduled_at'].min()} to {df['scheduled_at'].max()}")
print(f" Span: {(df['scheduled_at'].max() - df['scheduled_at'].min()).days} days")

# Check forecast timing (scheduled_at should precede the target timestamp)
invalid_times = df[df['scheduled_at'] >= df['timestamp']]
if len(invalid_times) == 0:
    print("✓ All forecasts have valid timing (scheduled before forecast time)")
else:
    print(f"⚠ Found {len(invalid_times)} forecasts with invalid timing")

Missing Data Analysis

# Check for missing values
missing_data = df.isnull().sum()
missing_pct = (missing_data / len(df)) * 100

print("=== Missing Data Analysis ===")
if missing_data.sum() == 0:
    print("✓ No missing values found")
else:
    print("Missing values by column:")
    missing_summary = pd.DataFrame({
        'Missing Count': missing_data[missing_data > 0],
        'Missing %': missing_pct[missing_pct > 0]
    }).round(2)
    print(missing_summary)
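
If missing values do turn up, how you handle them depends on which columns are affected. A minimal sketch, assuming you only want to drop rows that lack the median forecast itself:

# Drop rows with no P50 forecast; missing values in other columns
# (e.g. coordinates) do not affect the price analysis below
if missing_data.sum() > 0:
    rows_before = len(df)
    df = df.dropna(subset=['p50'])
    print(f"Dropped {rows_before - len(df):,} rows with missing p50")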

Market and Geographic Analysis

Identify Markets and Price Nodes

print("=== Market Coverage ===")

# Price nodes by market
if 'object_iso' in df.columns:
    market_coverage = df.groupby('object_iso')['object_name'].nunique().sort_values(ascending=False)
    print("Price nodes by ISO:")
    print(market_coverage)

# Unique price nodes
print(f"\nAll price nodes ({df['object_name'].nunique()}):")
price_nodes = df['object_name'].unique()
for i, node in enumerate(sorted(price_nodes)):
    print(f" {i+1:2d}. {node}")

# Data series coverage
print(f"\nData series ({df['series_name'].nunique()}):")
series_coverage = df.groupby('series_name').agg({
    'object_name': 'nunique',
    'timestamp': ['min', 'max'],
    'p50': 'count'
}).round(2)
series_coverage.columns = ['Price Nodes', 'Start Date', 'End Date', 'Forecast Count']
print(series_coverage)

Geographic Distribution

# If geographic coordinates are available
if 'object_latitude' in df.columns and 'object_longitude' in df.columns:
    print("=== Geographic Coverage ===")

    # Remove any missing coordinates
    geo_data = df[['object_name', 'object_latitude', 'object_longitude']].dropna().drop_duplicates()

    print(f"Price nodes with coordinates: {len(geo_data)}")
    print(f"Latitude range: {geo_data['object_latitude'].min():.2f} to {geo_data['object_latitude'].max():.2f}")
    print(f"Longitude range: {geo_data['object_longitude'].min():.2f} to {geo_data['object_longitude'].max():.2f}")

    # Simple geographic plot
    plt.figure(figsize=(10, 6))
    plt.scatter(geo_data['object_longitude'], geo_data['object_latitude'], alpha=0.7)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Geographic Distribution of Price Nodes')
    plt.grid(True, alpha=0.3)
    plt.show()

Forecast Performance Analysis

Price Distribution Analysis

print("=== Price Forecast Analysis ===")

# Overall price statistics
price_stats = df['p50'].describe()
print("P50 forecast statistics:")
print(price_stats)

# Price statistics by series
print("\nPrice statistics by data series:")
series_stats = df.groupby('series_name')['p50'].describe().round(2)
print(series_stats)

# Identify extreme price events
high_price_threshold = df['p50'].quantile(0.95)
low_price_threshold = df['p50'].quantile(0.05)

print(f"\nExtreme price events:")
print(f"High price threshold (95th percentile): ${high_price_threshold:.2f}")
print(f"Low price threshold (5th percentile): ${low_price_threshold:.2f}")

high_price_events = df[df['p50'] >= high_price_threshold]
print(f"High price events: {len(high_price_events)} ({len(high_price_events)/len(df)*100:.1f}%)")

Historical Performance Analysis

If actual values are available in your dataset:

if 'actual' in df.columns:
    print("=== Forecast Accuracy Analysis ===")

    # Calculate forecast errors
    df['error'] = df['p50'] - df['actual']
    df['abs_error'] = np.abs(df['error'])
    df['squared_error'] = df['error'] ** 2

    # Overall accuracy metrics
    mae = df['abs_error'].mean()
    rmse = np.sqrt(df['squared_error'].mean())
    # MAPE divides by |actual|; note it becomes unstable when prices are near zero
    mape = (df['abs_error'] / df['actual'].abs()).mean() * 100

    print("Overall forecast accuracy:")
    print(f" MAE (Mean Absolute Error): ${mae:.2f}")
    print(f" RMSE (Root Mean Square Error): ${rmse:.2f}")
    print(f" MAPE (Mean Absolute Percentage Error): {mape:.1f}%")

    # Accuracy by price node
    accuracy_by_node = df.groupby('object_name').agg({
        'abs_error': 'mean',
        'squared_error': lambda x: np.sqrt(x.mean()),
        'actual': 'count'
    }).round(2)
    accuracy_by_node.columns = ['MAE', 'RMSE', 'Forecast Count']
    accuracy_by_node = accuracy_by_node.sort_values('MAE')

    print("\nTop 5 most accurate price nodes:")
    print(accuracy_by_node.head())

    print("\nTop 5 least accurate price nodes:")
    print(accuracy_by_node.tail())
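
Point-error metrics only evaluate the P50. Because the extract also includes quantile columns, you can check calibration of the forecast intervals: roughly 80% of actuals should land between P10 and P90, and roughly 50% between P25 and P75. A sketch, assuming the p10/p25/p75/p90 columns are present:

if 'actual' in df.columns:
    # Empirical coverage of the forecast intervals (compare against the
    # nominal 80% for P10-P90 and 50% for P25-P75)
    coverage_80 = ((df['actual'] >= df['p10']) & (df['actual'] <= df['p90'])).mean()
    coverage_50 = ((df['actual'] >= df['p25']) & (df['actual'] <= df['p75'])).mean()
    print("\nInterval coverage:")
    print(f"  P10-P90 (nominal 80%): {coverage_80*100:.1f}%")
    print(f"  P25-P75 (nominal 50%): {coverage_50*100:.1f}%")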

Data Visualization

Price Distribution Histograms

# Create histograms for each price node
unique_objects = df['object_name'].unique()
num_objects = len(unique_objects)

if num_objects <= 10:  # Only create per-node plots for a manageable number of nodes
    fig, axes = plt.subplots(num_objects, 1, figsize=(10, 3 * num_objects), sharex=True)

    # Handle single object case
    if num_objects == 1:
        axes = [axes]

    for ax, obj_name in zip(axes, unique_objects):
        data = df[df['object_name'] == obj_name]['p50']
        ax.hist(data, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
        ax.set_title(f'P50 Forecast Distribution - {obj_name}', fontsize=12)
        ax.set_ylabel('Frequency')
        ax.grid(True, linestyle='--', alpha=0.6)

        # Add a mean reference line
        mean_val = data.mean()
        ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: ${mean_val:.1f}')
        ax.legend()

    plt.xlabel('Price ($/MWh)', fontsize=12)
    plt.tight_layout()
    plt.show()
else:
    # For many price nodes, create a summary plot
    plt.figure(figsize=(12, 6))
    df['p50'].hist(bins=50, edgecolor='black', alpha=0.7)
    plt.title('Overall P50 Forecast Distribution (All Price Nodes)')
    plt.xlabel('Price ($/MWh)')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()

Time Series Overview

# Time series plot for a sample price node
sample_node = df['object_name'].iloc[0]
sample_data = df[df['object_name'] == sample_node].sort_values('timestamp')

plt.figure(figsize=(14, 8))

# Plot forecast uncertainty bands
plt.fill_between(sample_data['timestamp'], sample_data['p10'], sample_data['p90'],
                 alpha=0.3, color='blue', label='80% Confidence Interval (P10-P90)')
plt.fill_between(sample_data['timestamp'], sample_data['p25'], sample_data['p75'],
                 alpha=0.5, color='blue', label='50% Confidence Interval (P25-P75)')

# Plot median forecast
plt.plot(sample_data['timestamp'], sample_data['p50'],
         color='darkblue', linewidth=2, label='Median Forecast (P50)')

# Plot actual values if available
if 'actual' in df.columns:
    plt.plot(sample_data['timestamp'], sample_data['actual'],
             color='red', linewidth=1.5, label='Actual Prices', alpha=0.8)

plt.title(f'Forecast Time Series - {sample_node}', fontsize=14, fontweight='bold')
plt.xlabel('Time')
plt.ylabel('Price ($/MWh)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
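
If your extract contains more than one forecast vintage, the plot above will overlay several forecasts for the same target hours. A sketch of keeping only the most recent vintage per timestamp (swap latest_vintage in for sample_data in the plotting code above):

# Keep the most recent vintage for each target timestamp so the plot
# shows a single forecast trace per hour
latest_vintage = (sample_data.sort_values('scheduled_at')
                  .drop_duplicates(subset='timestamp', keep='last')
                  .sort_values('timestamp'))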

Model Performance Comparison

if df['model_id'].nunique() > 1:
    print("=== Model Comparison ===")

    # Compare models by accuracy (if actuals available)
    if 'actual' in df.columns:
        model_performance = df.groupby('model_id').agg({
            'abs_error': 'mean',
            'squared_error': lambda x: np.sqrt(x.mean()),
            'actual': 'count'
        }).round(2)
        model_performance.columns = ['MAE', 'RMSE', 'Forecast Count']
        model_performance = model_performance.sort_values('MAE')

        print("Model performance ranking (by MAE):")
        print(model_performance)

        # Visualize model performance
        model_performance[['MAE', 'RMSE']].plot(kind='bar', figsize=(10, 6))
        plt.title('Model Performance Comparison')
        plt.ylabel('Error ($/MWh)')
        plt.xlabel('Model ID')
        plt.xticks(rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.show()

Advanced Analysis

Forecast Horizon Analysis

# Calculate operating hour offset (time between forecast creation and target time)
df['operating_hour_offset'] = (df['timestamp'] - df['scheduled_at']).dt.total_seconds() / 3600

print("=== Forecast Horizon Analysis ===")
print(f"Operating hour offset range: {df['operating_hour_offset'].min():.1f} to {df['operating_hour_offset'].max():.1f} hours")

# Analyze accuracy by forecast horizon (if actuals available)
if 'actual' in df.columns:
    # Group by forecast horizon
    horizon_bins = [0, 6, 12, 24, 48, 72, float('inf')]
    horizon_labels = ['0-6h', '6-12h', '12-24h', '24-48h', '48-72h', '72h+']
    df['horizon_group'] = pd.cut(df['operating_hour_offset'], bins=horizon_bins, labels=horizon_labels)

    horizon_accuracy = df.groupby('horizon_group').agg({
        'abs_error': 'mean',
        'squared_error': lambda x: np.sqrt(x.mean()),
        'actual': 'count'
    }).round(2)
    horizon_accuracy.columns = ['MAE', 'RMSE', 'Count']

    print("\nAccuracy by forecast horizon:")
    print(horizon_accuracy)

    # Visualize horizon performance
    horizon_accuracy[['MAE', 'RMSE']].plot(kind='bar', figsize=(10, 6))
    plt.title('Forecast Accuracy by Time Horizon')
    plt.ylabel('Error ($/MWh)')
    plt.xlabel('Forecast Horizon')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

Seasonal and Temporal Patterns

# Add time-based features
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.day_name()
df['month'] = df['timestamp'].dt.month_name()

# Hourly price patterns
hourly_patterns = df.groupby('hour')['p50'].agg(['mean', 'std']).round(2)

plt.figure(figsize=(12, 6))
plt.plot(hourly_patterns.index, hourly_patterns['mean'], marker='o', linewidth=2)
plt.fill_between(hourly_patterns.index,
                 hourly_patterns['mean'] - hourly_patterns['std'],
                 hourly_patterns['mean'] + hourly_patterns['std'],
                 alpha=0.3)
plt.title('Average Price Forecast by Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Average P50 Forecast ($/MWh)')
plt.grid(True, alpha=0.3)
plt.xticks(range(0, 24))
plt.tight_layout()
plt.show()

print("=== Temporal Patterns ===")
print("Average prices by hour of day:")
print(hourly_patterns)
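
The day_of_week column created above can be used the same way; weekday/weekend differences are often as pronounced as hour-of-day effects. A brief sketch:

# Average P50 forecast by day of week, in calendar order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']
weekly_patterns = (df.groupby('day_of_week')['p50']
                   .mean()
                   .reindex(day_order)
                   .round(2))
print("\nAverage P50 forecast by day of week:")
print(weekly_patterns)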

Summary Report

def generate_summary_report(df):
    """Generate a comprehensive summary report of the dataset."""

    print("=" * 60)
    print("ENERTEL FORECAST DATA EXPLORATION SUMMARY")
    print("=" * 60)

    # Dataset overview
    print("\n📊 DATASET OVERVIEW")
    print(f" Total forecasts: {len(df):,}")
    print(f" Date range: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}")
    print(f" Price nodes: {df['object_name'].nunique()}")
    print(f" Data series: {df['series_name'].nunique()}")
    print(f" Models: {df['model_id'].nunique()}")

    # Data quality
    print("\n✅ DATA QUALITY")
    print(f" Missing values: {df.isnull().sum().sum()}")
    print(f" Duplicate forecasts: {df.duplicated(subset=['timestamp', 'feature_id', 'scenario_id', 'model_id']).sum()}")

    # Price analysis
    print("\n💰 PRICE ANALYSIS")
    print(f" Average P50 forecast: ${df['p50'].mean():.2f}")
    print(f" Price range: ${df['p50'].min():.2f} - ${df['p50'].max():.2f}")
    print(f" High price events (>95th percentile): {(df['p50'] > df['p50'].quantile(0.95)).sum()}")

    # Accuracy (if available)
    if 'actual' in df.columns:
        mae = np.abs(df['p50'] - df['actual']).mean()
        rmse = np.sqrt(((df['p50'] - df['actual']) ** 2).mean())
        print("\n🎯 FORECAST ACCURACY")
        print(f" Mean Absolute Error: ${mae:.2f}")
        print(f" Root Mean Square Error: ${rmse:.2f}")

    print(f"\n{'=' * 60}")
    print("Exploration complete! Proceed to detailed analysis tutorials.")
    print("=" * 60)

# Generate final summary
generate_summary_report(df)

Next Steps

After completing this exploration, you're ready to proceed with more detailed analysis:

  1. Day-Ahead Evaluation: Analyze day-ahead forecast performance with detailed metrics and visualizations
  2. Hourly Evaluation: Examine real-time forecast accuracy for short-term trading decisions
  3. Custom Analysis: Use the patterns identified here to build custom analysis workflows

Tips for Success

  • Start broad, then narrow: Use this exploration to identify interesting patterns, then dive deeper with focused analysis
  • Validate assumptions: Always check data quality before drawing conclusions
  • Document findings: Keep notes on interesting patterns for future reference
  • Iterate: Return to exploration as you learn more about your specific use case

Data Visualization Tools

We recommend the VS Code extension "Data Wrangler" for interactive DataFrame exploration and the Jupyter notebook environment for iterative analysis.

Happy exploring! 🚀