How to Resume Python Machine Learning Training After a Machine Restart

Machine learning model training can take hours or days, making unexpected system restarts a major concern. Fortunately, Python provides several strategies to resume your work seamlessly after interruptions. This article explores practical approaches to implement checkpointing, data persistence, and recovery mechanisms.

Strategy 1: Implementing Model Checkpoints

Checkpointing saves your model's state at regular intervals during training. This allows you to resume from the last saved state instead of starting over:

TensorFlow Checkpoints

import tensorflow as tf
from tensorflow import keras
import numpy as np

# Create sample data: 1000 examples, 32 features each, binary labels.
x_train = np.random.random((1000, 32))
y_train = np.random.randint(2, size=(1000, 1))

# Build a small binary classifier.
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(32,)),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Setup checkpoint callback.
# NOTE: an integer `save_freq` counts *batches*, not epochs, so a bare
# `save_freq=5` would save every 5 batches. To save every 5 epochs, convert
# epochs to batches using the number of steps per epoch.
batch_size = 32
steps_per_epoch = int(np.ceil(len(x_train) / batch_size))
checkpoint_path = "training_checkpoints/cp-{epoch:04d}.ckpt"
cp_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5 * steps_per_epoch)  # Save every 5 epochs, expressed in batches

# Train with checkpointing (batch_size must match steps_per_epoch above).
model.fit(x_train, y_train, batch_size=batch_size, epochs=10,
          callbacks=[cp_callback])

# To resume training, load the latest checkpoint. latest_checkpoint()
# returns None when no checkpoint exists yet (e.g. very first run), so
# guard against passing None to load_weights().
latest = tf.train.latest_checkpoint('training_checkpoints')
if latest is not None:
    model.load_weights(latest)
    print(f"Resumed from checkpoint: {latest}")
else:
    print("No checkpoint found - starting from scratch")

PyTorch Checkpoints

import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple model
class SimpleNet(nn.Module):
    """A minimal two-layer feed-forward network for binary classification.

    Maps a 32-feature input through a 64-unit hidden layer (ReLU) down to a
    single sigmoid probability.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(32, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        hidden = self.fc1(x).relu()
        return self.fc2(hidden).sigmoid()

# Instantiate the network and an Adam optimizer over its parameters
# (default hyperparameters). Both hold state that must be checkpointed
# together for an exact resume.
model = SimpleNet()
optimizer = optim.Adam(model.parameters())

def save_checkpoint(model, optimizer, epoch, loss, filename):
    """Serialize the full training state (model weights, optimizer state,
    epoch counter and last loss) into a single checkpoint file."""
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        },
        filename,
    )

def load_checkpoint(filename, model, optimizer, map_location=None):
    """Restore state saved by ``save_checkpoint`` into model and optimizer.

    Parameters:
        filename: path of the checkpoint file.
        model / optimizer: objects whose ``load_state_dict`` is called.
        map_location: optional device remap (e.g. ``'cpu'``). After a machine
            restart the original GPU may be unavailable; passing ``'cpu'``
            lets a checkpoint written on a GPU machine still be restored.
            Default ``None`` preserves the original load-to-saved-device
            behavior.

    Returns:
        (epoch, loss) recorded in the checkpoint.
    """
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch'], checkpoint['loss']

# Example usage: persist the current training state to disk, then restore
# it (after a restart, these two calls would run in separate processes).
save_checkpoint(model, optimizer, epoch=5, loss=0.25, filename='model_checkpoint.pth')
epoch, loss = load_checkpoint('model_checkpoint.pth', model, optimizer)
print(f"Resumed from epoch {epoch} with loss {loss:.4f}")

Strategy 2: Persisting Preprocessed Data

Save time by storing preprocessed features and datasets to avoid repeating expensive computations:

import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Create sample dataset: two Gaussian features and a binary target.
data = pd.DataFrame({
    'feature1': np.random.randn(1000),
    'feature2': np.random.randn(1000),
    'target': np.random.randint(0, 2, 1000)
})

# Preprocessing: standardize features to zero mean / unit variance.
X = data[['feature1', 'feature2']]
y = data['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fix random_state so the train/test split is reproducible: if preprocessing
# ever has to be re-run after a restart, the same rows end up in the same
# split instead of silently leaking test data into training.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

# Save preprocessed data together with the *fitted* scaler -- the scaler is
# required later to transform new/inference data identically.
with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump({
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler
    }, f)

# Load preprocessed data after restart.
# NOTE: only unpickle files you created yourself -- pickle.load on untrusted
# data can execute arbitrary code.
with open('preprocessed_data.pkl', 'rb') as f:
    data_dict = pickle.load(f)
    X_train_loaded = data_dict['X_train']
    scaler_loaded = data_dict['scaler']

print(f"Loaded training data shape: {X_train_loaded.shape}")
print("Preprocessing complete - ready to resume training!")

Strategy 3: Cloud-Based Storage Integration

Use cloud storage to backup checkpoints and ensure accessibility across different machines:

import boto3
import os

# AWS S3 integration example
def upload_checkpoint_to_s3(local_file, bucket_name, s3_key):
    """Upload a local checkpoint file to S3.

    Returns True on success and False on failure, mirroring
    ``download_checkpoint_from_s3`` so callers can react to either outcome
    (e.g. retry the backup) instead of only seeing a printed message.
    """
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(local_file, bucket_name, s3_key)
        print(f"Checkpoint uploaded to s3://{bucket_name}/{s3_key}")
        return True
    except Exception as e:
        # Broad catch is deliberate: a failed backup should be reported,
        # not crash the training run it is protecting.
        print(f"Upload failed: {e}")
        return False

def download_checkpoint_from_s3(bucket_name, s3_key, local_file):
    """Fetch a checkpoint object from S3 to a local path.

    Returns True when the download succeeded, False otherwise.
    """
    client = boto3.client('s3')
    try:
        client.download_file(bucket_name, s3_key, local_file)
    except Exception as e:
        print(f"Download failed: {e}")
        return False
    print(f"Checkpoint downloaded to {local_file}")
    return True

# Usage: mirror the local checkpoint to S3, then pull it back down
# (e.g. onto a fresh machine after a restart).
bucket = "my-ml-checkpoints"
upload_checkpoint_to_s3("model_checkpoint.pth", bucket, "project1/checkpoint.pth")
download_checkpoint_from_s3(bucket, "project1/checkpoint.pth", "restored_checkpoint.pth")

Strategy 4: Complete Training Script with Resume Capability

import os
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

class ResumableTrainer:
    """Persists training progress so a run can resume after a restart.

    Each checkpoint consists of a pickled model plus a small JSON progress
    file recording the current epoch, the model's path and the metrics
    collected so far.
    """

    def __init__(self, checkpoint_dir="checkpoints"):
        self.checkpoint_dir = checkpoint_dir
        self.progress_file = os.path.join(checkpoint_dir, "progress.json")
        os.makedirs(checkpoint_dir, exist_ok=True)

    def save_progress(self, epoch, model, metrics):
        """Pickle `model` and atomically record progress for `epoch`."""
        # Save model
        model_path = os.path.join(self.checkpoint_dir, f"model_epoch_{epoch}.pkl")
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)

        # Write the progress file atomically (temp file + rename) so a crash
        # or restart mid-write cannot leave a truncated progress.json behind
        # -- that is exactly the failure mode this class exists to survive.
        progress = {
            'current_epoch': epoch,
            'model_path': model_path,
            'metrics': metrics
        }
        tmp_path = self.progress_file + ".tmp"
        with open(tmp_path, 'w') as f:
            json.dump(progress, f)
        os.replace(tmp_path, self.progress_file)

        print(f"Checkpoint saved at epoch {epoch}")

    def load_progress(self):
        """Return (epoch, model, metrics), or (0, None, {}) when no
        checkpoint has been saved yet.

        NOTE: pickle.load must only be used on checkpoints you wrote
        yourself -- unpickling untrusted data can execute arbitrary code.
        """
        if os.path.exists(self.progress_file):
            with open(self.progress_file, 'r') as f:
                progress = json.load(f)

            with open(progress['model_path'], 'rb') as f:
                model = pickle.load(f)

            print(f"Resumed from epoch {progress['current_epoch']}")
            return progress['current_epoch'], model, progress['metrics']
        return 0, None, {}

# Example usage
trainer = ResumableTrainer()

# Generate sample data
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)

# Try to resume; load_progress() returns (0, None, {}) on a fresh start,
# so start_epoch is already correct in both cases.
start_epoch, model, metrics = trainer.load_progress()

if model is None:
    print("Starting training from scratch")
    # warm_start=True makes each subsequent fit() call *add* trees on top of
    # the existing forest (true incremental training). Without it, bumping
    # n_estimators and refitting would retrain the whole forest each epoch.
    model = RandomForestClassifier(n_estimators=10, warm_start=True)

# Simulate training epochs
for epoch in range(start_epoch, 5):
    # Grow the forest by 10 trees; with warm_start only the new trees fit.
    model.n_estimators += 10
    model.fit(X, y)

    # Calculate metrics (training accuracy, for demonstration only)
    accuracy = model.score(X, y)
    metrics[f'epoch_{epoch}'] = {'accuracy': accuracy}

    # Save checkpoint every epoch
    trainer.save_progress(epoch + 1, model, metrics)
    print(f"Epoch {epoch + 1} completed, accuracy: {accuracy:.4f}")

print("Training completed successfully!")

Best Practices Comparison

Strategy Use Case Advantages Considerations
Model Checkpoints Deep learning training Resume exact state Storage space
Data Persistence Complex preprocessing Saves computation time Version compatibility
Cloud Storage Multi-machine training Accessible anywhere Network dependency
Containerization Environment consistency Portable setup Container size

Conclusion

Implementing checkpoint systems and data persistence strategies ensures your machine learning work can resume seamlessly after system restarts. Choose the approach that best fits your project's complexity and infrastructure requirements for maximum robustness.

Updated on: 2026-03-27T01:10:51+05:30

429 Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements