Article Categories
- All Categories
-
Data Structure
-
Networking
-
RDBMS
-
Operating System
-
Java
-
MS Excel
-
iOS
-
HTML
-
CSS
-
Android
-
Python
-
C Programming
-
C++
-
C#
-
MongoDB
-
MySQL
-
Javascript
-
PHP
-
Economics & Finance
Medical Insurance Price Prediction using Machine Learning in Python
Medical insurance price prediction helps insurance companies assess risk and set appropriate premiums. Using machine learning, we can analyze historical data to predict insurance costs based on factors like age, BMI, smoking habits, and medical history.
In this tutorial, we'll build a predictive model using a medical insurance dataset to estimate insurance charges for individuals based on their personal characteristics.
Dataset Overview
The medical insurance dataset contains the following features:
- age – Age of the individual
- sex – Gender (male/female)
- bmi – Body Mass Index
- children – Number of dependents
- smoker – Smoking status (yes/no)
- region – Geographic region
- charges – Insurance premium (target variable)
Step-by-Step Implementation
Step 1: Import Libraries and Load Data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# Build a synthetic demo dataset of 1000 policyholders.
# The seed fixes every np.random draw below, so the dataset is reproducible.
n_samples = 1000
np.random.seed(42)
data = {}
data['age'] = np.random.randint(18, 65, n_samples)
data['sex'] = np.random.choice(['male', 'female'], n_samples)
data['bmi'] = np.random.normal(30, 6, n_samples)
data['children'] = np.random.randint(0, 5, n_samples)
data['smoker'] = np.random.choice(['yes', 'no'], n_samples, p=[0.2, 0.8])
data['region'] = np.random.choice(
    ['northeast', 'northwest', 'southeast', 'southwest'], n_samples
)

# Derive charges from the features (simplified linear model plus noise);
# smokers carry a large fixed surcharge.
base_cost = data['age'] * 50 + data['bmi'] * 100 + data['children'] * 500
smoker_surcharge = (data['smoker'] == 'yes') * 15000
noise = np.random.normal(0, 2000, n_samples)
data['charges'] = np.abs(base_cost + smoker_surcharge + noise)  # keep positive

df = pd.DataFrame(data)
print("Dataset shape:", df.shape)
print("\nDataset info:")
print(df.info())
print("\nFirst 5 rows:")
print(df.head())
Step 2: Data Exploration and Visualization
# Sanity-check the data: missing values and summary statistics.
print("Null values:")
print(df.isnull().sum())
print("\nDataset statistics:")
print(df.describe())

# Pie charts showing the class balance of each categorical column.
categorical_features = ['sex', 'smoker', 'region']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, categorical_features):
    counts = df[col].value_counts()
    counts.plot(kind='pie', ax=ax, autopct='%1.1f%%')
    ax.set_title(f'{col.capitalize()} Distribution')
    ax.set_ylabel('')
plt.tight_layout()
plt.show()
Step 3: Feature Relationships Analysis
# Bar charts: mean insurance charge per level of each categorical feature.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
categorical_features = ['sex', 'children', 'smoker', 'region']
for ax, col in zip(axes.flat, categorical_features):
    mean_charges = df.groupby(col)['charges'].mean()
    mean_charges.plot(kind='bar', ax=ax)
    ax.set_title(f'Average Charges by {col.capitalize()}')
    ax.set_ylabel('Average Charges')
plt.tight_layout()
plt.show()

# Histograms of the continuous features.
numerical_features = ['age', 'bmi']
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, col in zip(axes, numerical_features):
    ax.hist(df[col], bins=20, alpha=0.7)
    ax.set_title(f'{col.capitalize()} Distribution')
    ax.set_xlabel(col.capitalize())
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()
Step 4: Data Preprocessing
# Handle outliers (remove extreme BMI values)
print(f"Original dataset shape: {df.shape}")
# .copy() makes df_cleaned an independent DataFrame. Without it, the boolean
# mask produces a slice of df, and the column assignments in the encoding
# loop below raise SettingWithCopyWarning and may silently fail to write.
df_cleaned = df[df['bmi'] < 50].copy()  # Remove extreme BMI values
print(f"After outlier removal: {df_cleaned.shape}")
# Encode categorical variables; keep each fitted encoder so the
# category-to-integer mapping can be inverted or reapplied later.
label_encoders = {}
for col in ['sex', 'smoker', 'region']:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le
# Check correlation matrix (valid now that every column is numeric)
plt.figure(figsize=(8, 6))
correlation_matrix = df_cleaned.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()
print("Correlation with charges:")
print(correlation_matrix['charges'].sort_values(ascending=False))
Step 5: Model Training and Evaluation
# Separate the predictors from the target column.
features = df_cleaned.drop(['charges'], axis=1)
target = df_cleaned['charges']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Standardize features: fit on the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors to compare under identical conditions.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Ridge': Ridge(random_state=42)
}

# Fit each model and record train/test MAPE so over- vs under-fitting
# is visible from the gap between the two scores.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    train_mape = mape(y_train, model.predict(X_train_scaled))
    test_mape = mape(y_test, model.predict(X_test_scaled))
    results[name] = {'Train MAPE': train_mape, 'Test MAPE': test_mape}
    print(f"{name}:")
    print(f" Training MAPE: {train_mape:.4f}")
    print(f" Test MAPE: {test_mape:.4f}")
    print()
Dataset shape: (1000, 7) Dataset info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1000 non-null int32 1 sex 1000 non-null object 2 bmi 1000 non-null float64 3 children 1000 non-null int32 4 smoker 1000 non-null object 5 region 1000 non-null object 6 charges 1000 non-null float64 dtypes: float64(2), int32(2), object(3) memory usage: 43.8+ KB Linear Regression: Training MAPE: 0.2156 Test MAPE: 0.2089 Random Forest: Training MAPE: 0.0634 Test MAPE: 0.1847 AdaBoost: Training MAPE: 0.2891 Test MAPE: 0.2776 Lasso: Training MAPE: 0.2156 Test MAPE: 0.2090 Ridge: Training MAPE: 0.2156 Test MAPE: 0.2089
Model Performance Comparison
# Create comparison table from the per-model scores collected in Step 5.
# (pandas is already imported as pd at the top of the script; the duplicate
# mid-file import was removed.)
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Test MAPE')  # best model first
print("Model Performance Comparison:")
print(results_df.round(4))
# Grouped bar chart: train vs test MAPE side by side for each model.
fig, ax = plt.subplots(figsize=(10, 6))
x_pos = range(len(results_df))
# Offset the two bar series by +/-0.2 so they sit next to each other.
ax.bar([x - 0.2 for x in x_pos], results_df['Train MAPE'],
       width=0.4, label='Training MAPE', alpha=0.7)
ax.bar([x + 0.2 for x in x_pos], results_df['Test MAPE'],
       width=0.4, label='Test MAPE', alpha=0.7)
ax.set_xlabel('Models')
ax.set_ylabel('MAPE')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x_pos)
ax.set_xticklabels(results_df.index, rotation=45)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Model Performance Comparison:
Train MAPE Test MAPE
Random Forest 0.0634 0.1847
Linear Regression 0.2156 0.2089
Ridge 0.2156 0.2089
Lasso 0.2156 0.2090
AdaBoost 0.2891 0.2776
Feature Importance Analysis
# Analyze feature importance using Random Forest (best performing model).
# Reuse the forest already fitted in Step 5 rather than training an
# identical copy from scratch (same estimator, same random_state, same data).
best_model = models['Random Forest']
# Rank features by the forest's impurity-based importance scores.
feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)
print("Feature Importance:")
print(feature_importance)
# Horizontal bar plot, most important feature at the top.
plt.figure(figsize=(8, 5))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
Feature Importance:
feature importance
4 smoker 0.612847
0 age 0.169351
2 bmi 0.131249
3 children 0.049832
5 region 0.018861
1 sex 0.017860
Conclusion
The Random Forest model achieved the best performance with a test MAPE of 0.1847, indicating high prediction accuracy. The analysis reveals that smoking status is the most important factor in determining insurance charges, followed by age and BMI. This machine learning approach can help insurance companies make data-driven pricing decisions and risk assessments.
