Medical Insurance Price Prediction using Machine Learning in Python

Medical insurance price prediction helps insurance companies assess risk and set appropriate premiums. Using machine learning, we can analyze historical data to predict insurance costs based on factors like age, BMI, smoking habits, and medical history.

In this tutorial, we'll build a predictive model using a medical insurance dataset to estimate insurance charges for individuals based on their personal characteristics.

Dataset Overview

The medical insurance dataset contains the following features:

  • age – Age of the individual
  • sex – Gender (male/female)
  • bmi – Body Mass Index
  • children – Number of dependents
  • smoker – Smoking status (yes/no)
  • region – Geographic region
  • charges – Insurance premium (target variable)

Step-by-Step Implementation

Step 1: Import Libraries and Load Data

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# Build a synthetic insurance dataset so the tutorial is self-contained.
# The seed is fixed so every run (and every printed output below) is
# reproducible; the order of the np.random calls matters for that.
np.random.seed(42)
data = {
    'age': np.random.randint(18, 65, 1000),
    'sex': np.random.choice(['male', 'female'], 1000),
    'bmi': np.random.normal(30, 6, 1000),
    'children': np.random.randint(0, 5, 1000),
    'smoker': np.random.choice(['yes', 'no'], 1000, p=[0.2, 0.8]),
    'region': np.random.choice(['northeast', 'northwest', 'southeast', 'southwest'], 1000),
}

# Synthesize the target as a linear blend of the features plus Gaussian
# noise: smokers carry a large fixed surcharge; age, BMI and dependents
# contribute smaller linear terms.
charges = (
    data['age'] * 50
    + data['bmi'] * 100
    + data['children'] * 500
    + (data['smoker'] == 'yes') * 15000
    + np.random.normal(0, 2000, 1000)
)
# Premiums cannot be negative, so fold any noise-driven negatives back.
charges = np.abs(charges)

data['charges'] = charges
df = pd.DataFrame(data)

print("Dataset shape:", df.shape)
print("\nDataset info:")
print(df.info())
print("\nFirst 5 rows:")
print(df.head())

Step 2: Data Exploration and Visualization

# Data-quality pass: confirm there are no missing values, then summarize
# the numeric columns.
print("Null values:")
print(df.isnull().sum())

print("\nDataset statistics:")
print(df.describe())

# One pie chart per categorical column, showing each category's share.
categorical_features = ['sex', 'smoker', 'region']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for axis, feature in zip(axes, categorical_features):
    counts = df[feature].value_counts()
    counts.plot(kind='pie', ax=axis, autopct='%1.1f%%')
    axis.set_title(f'{feature.capitalize()} Distribution')
    axis.set_ylabel('')  # drop the default "count"/column label

plt.tight_layout()
plt.show()

Step 3: Feature Relationships Analysis

# Mean charges per category for each discrete feature, on a 2x2 grid of
# bar charts (axes.flat walks the grid in row-major order).
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
categorical_features = ['sex', 'children', 'smoker', 'region']

for axis, feature in zip(axes.flat, categorical_features):
    df.groupby(feature)['charges'].mean().plot(kind='bar', ax=axis)
    axis.set_title(f'Average Charges by {feature.capitalize()}')
    axis.set_ylabel('Average Charges')

plt.tight_layout()
plt.show()

# Histograms for the two continuous predictors.
numerical_features = ['age', 'bmi']
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for axis, feature in zip(axes, numerical_features):
    axis.hist(df[feature], bins=20, alpha=0.7)
    axis.set_title(f'{feature.capitalize()} Distribution')
    axis.set_xlabel(feature.capitalize())
    axis.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

Step 4: Data Preprocessing

# Handle outliers: drop rows with physiologically implausible BMI values.
print(f"Original dataset shape: {df.shape}")
# .copy() is the fix here: a plain boolean-mask slice is a view-like
# object, and assigning encoded columns into it below would raise
# SettingWithCopyWarning and can silently fail to write under pandas'
# copy-on-write semantics. Copying gives an independent frame.
df_cleaned = df[df['bmi'] < 50].copy()
print(f"After outlier removal: {df_cleaned.shape}")

# Encode categorical columns as integers. Each fitted encoder is kept so
# the original string labels can be recovered later via inverse_transform.
label_encoders = {}
for col in ['sex', 'smoker', 'region']:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le

# Correlation heatmap of the now fully numeric frame.
plt.figure(figsize=(8, 6))
correlation_matrix = df_cleaned.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

print("Correlation with charges:")
print(correlation_matrix['charges'].sort_values(ascending=False))

Step 5: Model Training and Evaluation

# Separate predictors from the target column.
features = df_cleaned.drop(['charges'], axis=1)
target = df_cleaned['charges']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Standardize features. The scaler is fit on the training split only so
# no information from the test set leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors; stochastic ones get a fixed random_state.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Ridge': Ridge(random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Score both splits with MAPE; a large train/test gap flags overfitting.
    train_pred = model.predict(X_train_scaled)
    test_pred = model.predict(X_test_scaled)
    train_mape = mape(y_train, train_pred)
    test_mape = mape(y_test, test_pred)

    results[name] = {'Train MAPE': train_mape, 'Test MAPE': test_mape}

    print(f"{name}:")
    print(f"  Training MAPE: {train_mape:.4f}")
    print(f"  Test MAPE: {test_mape:.4f}")
    print()
Dataset shape: (1000, 7)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1000 non-null   int32  
 1   sex       1000 non-null   object 
 2   bmi       1000 non-null   float64
 3   children  1000 non-null   int32  
 4   smoker    1000 non-null   object 
 5   region    1000 non-null   object 
 6   charges   1000 non-null   float64
dtypes: float64(2), int32(2), object(3)
memory usage: 43.8+ KB

Linear Regression:
  Training MAPE: 0.2156
  Test MAPE: 0.2089

Random Forest:
  Training MAPE: 0.0634
  Test MAPE: 0.1847

AdaBoost:
  Training MAPE: 0.2891
  Test MAPE: 0.2776

Lasso:
  Training MAPE: 0.2156
  Test MAPE: 0.2090

Ridge:
  Training MAPE: 0.2156
  Test MAPE: 0.2089

Model Performance Comparison

# Tabulate the results dict (one row per model), best test MAPE first.
# pandas is already imported at the top of the file as `pd`; the duplicate
# `import pandas as pd` that used to sit here was redundant and removed.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Test MAPE')
print("Model Performance Comparison:")
print(results_df.round(4))

# Grouped bar chart: training and test MAPE side by side for each model,
# offset +/-0.2 around each integer tick so the pairs don't overlap.
fig, ax = plt.subplots(figsize=(10, 6))
x_pos = range(len(results_df))

ax.bar([x - 0.2 for x in x_pos], results_df['Train MAPE'],
       width=0.4, label='Training MAPE', alpha=0.7)
ax.bar([x + 0.2 for x in x_pos], results_df['Test MAPE'],
       width=0.4, label='Test MAPE', alpha=0.7)

ax.set_xlabel('Models')
ax.set_ylabel('MAPE')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x_pos)
ax.set_xticklabels(results_df.index, rotation=45)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
Model Performance Comparison:
                   Train MAPE  Test MAPE
Random Forest          0.0634     0.1847
Linear Regression      0.2156     0.2089
Ridge                  0.2156     0.2089
Lasso                  0.2156     0.2090
AdaBoost               0.2891     0.2776

Feature Importance Analysis

# Feature importance from the Random Forest (best test MAPE above).
# Reuse the forest already fitted inside the model-comparison loop rather
# than training a second, byte-identical model: same estimator class, same
# random_state, same training data -> identical importances, and we skip
# a redundant full training pass.
best_model = models['Random Forest']

# Pair each input column with its learned importance, largest first.
feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# Horizontal bar chart; invert the y-axis so the top bar is the most
# important feature.
plt.figure(figsize=(8, 5))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
Feature Importance:
    feature  importance
4    smoker    0.612847
0       age    0.169351
2       bmi    0.131249
3  children    0.049832
5    region    0.018861
1       sex    0.017860

Conclusion

The Random Forest model achieved the best performance with a test MAPE of 0.1847, indicating high prediction accuracy. The analysis reveals that smoking status is the most important factor in determining insurance charges, followed by age and BMI. This machine learning approach can help insurance companies make data-driven pricing decisions and risk assessments.

Updated on: 2026-03-27T09:09:40+05:30

842 Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements