!pip install --upgrade eazyml-counterfactual
!pip install gdown python-dotenv

import os
import numpy as np
import pandas as pd
import eazyml as ez
from eazyml_counterfactual import (
        ez_cf_inference,
        ez_init        
)
import gdown

from dotenv import load_dotenv
load_dotenv()

# Scikit-learn libraries for model building
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Authenticate with EazyML using the key from the environment (populated by
# load_dotenv() above when a .env file is present); None is passed if unset.
ez_init(os.environ.get('EAZYML_ACCESS_KEY'))

{'success': True,
 'message': 'Initialized successfully. You may revoke your consent to sharing usage stats anytime. You have exclusive paid access.'}

# Download the example data folder from Google Drive
gdown.download_folder(id='1p7Udh2MjKyJPxI47FS89VowAz9ZEq_hG')

# Target column the model will learn to predict
outcome = "House_Price"

# Spreadsheets holding the training and test records (read from ./data)
train_file = os.path.join('data', "House Price Prediction - Train Data.xlsx")
test_file = os.path.join('data', "House Price Prediction - Test Data.xlsx")

# Read both spreadsheets into DataFrames, training file first
train_df, test_df = (pd.read_excel(path) for path in (train_file, test_file))

# Preview the first rows of the training data
ez.ez_display_df(train_df.head())

class UnifiedRegressorPreprocessor:
    """Preprocess tabular regression data for a scikit-learn style model.

    Numerical columns are mean-imputed and then standardized; object-dtype
    columns are one-hot encoded with the first level dropped. The target can
    optionally be standardized too and mapped back to its original scale with
    ``inverse_transform_outcome``.
    """

    def __init__(self):
        self.numerical_imputer = SimpleImputer(strategy="mean")
        self.scaler = StandardScaler()
        # ``sparse`` was renamed to ``sparse_output`` in scikit-learn 1.2 and
        # removed in 1.4. Prefer the new keyword and fall back to the old one
        # so the class works on both sides of the rename.
        try:
            self.categorical_encoder = OneHotEncoder(drop="first", sparse_output=False)
        except TypeError:
            self.categorical_encoder = OneHotEncoder(drop="first", sparse=False)
        self.target_scaler = StandardScaler()
        self.fitted = False

    def fit(self, X, y=None):
        """Learn the imputation, scaling and encoding parameters.

        Args:
            X: pandas DataFrame of features. Numeric columns go through the
                imputer and scaler, object-dtype columns through the encoder.
            y: optional target values; when given, the target scaler is
                fitted on them as well.

        Returns:
            The fitted preprocessor (``self``), so calls can be chained.
        """
        self.numerical_columns = X.select_dtypes(include=[np.number]).columns
        self.categorical_columns = X.select_dtypes(include=[object]).columns

        # Fit the scaler on the *imputed* values: that is exactly what
        # ``transform`` feeds it, so the learned mean/variance describe the
        # data the scaler will actually see.
        imputed_numeric = self.numerical_imputer.fit_transform(X[self.numerical_columns])
        self.scaler.fit(imputed_numeric)
        self.categorical_encoder.fit(X[self.categorical_columns])

        if y is not None:
            self.target_scaler.fit(np.array(y).reshape(-1, 1))

        self.fitted = True
        return self

    def transform(self, X, y=None):
        """Apply the fitted transformations.

        Args:
            X: pandas DataFrame containing the columns seen during ``fit``.
            y: optional target values to standardize with the target scaler
                (which must have been fitted by passing ``y`` to ``fit``).

        Returns:
            A DataFrame of transformed features indexed like ``X``; when
            ``y`` is given, a ``(features_df, y_scaled)`` tuple where
            ``y_scaled`` is a 1-D array.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if not self.fitted:
            raise ValueError("Preprocessor not fitted. Call 'fit' first.")

        X_num = self.scaler.transform(self.numerical_imputer.transform(X[self.numerical_columns]))
        X_cat = self.categorical_encoder.transform(X[self.categorical_columns])
        # Numeric names first, then the encoder's generated names, matching
        # the column order of the stacked array below.
        feature_names = list(self.numerical_columns) + list(self.categorical_encoder.get_feature_names_out())

        X_transformed_df = pd.DataFrame(np.hstack((X_num, X_cat)), columns=feature_names, index=X.index)

        if y is not None:
            y_transformed = self.target_scaler.transform(np.array(y).reshape(-1, 1)).flatten()
            return X_transformed_df, y_transformed

        return X_transformed_df

    def inverse_transform_outcome(self, y):
        """Map standardized target values back to the original scale.

        Args:
            y: array-like of values in the scaled target space.

        Returns:
            1-D numpy array of values in the original target units.
        """
        return self.target_scaler.inverse_transform(np.array(y).reshape(-1, 1)).flatten()

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (and ``y`` if given), then return ``transform(X, y)``."""
        self.fit(X, y)
        return self.transform(X, y)

# Separate the predictors from the target column in both datasets
X_train = train_df.drop(columns=[outcome])
y_train = train_df[outcome]
X_test = test_df.drop(columns=[outcome])
y_test = test_df[outcome]

# Learn the preprocessing on the training split, then reuse it on the test split
preprocessor = UnifiedRegressorPreprocessor()
X_train_transformed, y_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed, y_test_transformed = preprocessor.transform(X_test, y_test)

# Fit a linear regression on the transformed features and scaled target
model = LinearRegression().fit(X_train_transformed, y_train_transformed)

# Predict in the scaled target space, then convert back to the original units
y_pred_transformed = model.predict(X_test_transformed)
y_pred = preprocessor.inverse_transform_outcome(y_pred_transformed)

# Keep the predictions next to the original test records
predicted_df = test_df.copy()
predicted_df[f"Predicted {outcome}"] = y_pred

# Show the first ten test rows together with their predictions
print("\nTest DataFrame with Predictions:")
display(predicted_df.head(10))

# Score the predictions against the true values in the original units
metrics = {}
metrics["RMSE"] = np.sqrt(mean_squared_error(y_test, y_pred))
metrics["MAE"] = mean_absolute_error(y_test, y_pred)
metrics["R2 Score"] = r2_score(y_test, y_pred)

print("\nModel Performance Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.2f}")

Test DataFrame with Predictions:

Model Performance Metrics:
RMSE: 10367.90
MAE: 8185.60
R2 Score: 1.00

# Feature columns handed to the counterfactual engine
selected_features = [
    'Square_Footage', 'Num_Bedrooms', 'Num_Bathrooms', 'Year_Built',
    'Lot_Size', 'Garage_Size', 'Neighborhood_Quality',
]

# Features that must stay fixed; every other selected feature is a variant
invariants = ['Year_Built']
variants = [name for name in selected_features if name not in invariants]

# Options forwarded to ez_cf_inference
cf_options = {
    "variants": variants,
    "outcome_ordinality": "maximize",  # Desired action
    "train_data": train_file,
    "preprocessor": preprocessor,
}

# Pick one test record by index label; the double brackets keep it a one-row DataFrame
test_index_no = 0
test_data = predicted_df.loc[[test_index_no]]

# Run the counterfactual inference for that record
result, optimal_transition_df = ez_cf_inference(
    test_data=test_data,
    outcome=outcome,
    selected_features=selected_features,
    model_info=model,
    options=cf_options,
)

# Show the result summary (success flag, message, actual vs. optimal outcome)
ez.ez_display_json(result)

{   'success': True,
    'message': 'Optimal transition found',
    'summary': {'Actual Outcome': 868458.17, 'Optimal Outcome': 1058596.79}}

# Display the per-feature transition table returned by ez_cf_inference: for each
# feature it lists the actual value, the optimal value, and the percentage and
# absolute change between them.
ez.ez_display_df(optimal_transition_df)

	Square_Footage	Num_Bedrooms	Num_Bathrooms	Year_Built	Lot_Size	Garage_Size	Neighborhood_Quality	House_Price
0	4235	3	3	2000	1.911679	1	8	917235.410532
1	4006	4	2	2003	1.092441	2	4	871566.562740
2	785	5	3	1995	3.823276	2	3	262707.278933
3	2827	3	1	1977	3.213678	2	4	605143.959115
4	2219	4	1	1965	0.725965	0	4	470083.290367

	Square_Footage	Num_Bedrooms	Num_Bathrooms	Year_Built	Lot_Size	Garage_Size	Neighborhood_Quality	House_Price	Predicted House_Price
0	4012	3	1	2016	2.098092	1	5	9.010005e+05	8.684582e+05
1	2310	3	1	1988	1.369622	1	4	4.945375e+05	4.901319e+05
2	4708	1	3	1962	1.792970	1	8	9.494042e+05	9.456196e+05
3	4932	2	1	1972	4.479598	1	2	1.040389e+06	1.033595e+06
4	3646	1	1	1994	3.980987	0	9	7.940100e+05	7.764987e+05
5	3586	2	2	1964	2.568429	0	10	7.240336e+05	7.323173e+05
6	4638	4	3	2000	1.490399	1	3	9.984392e+05	9.951187e+05
7	4127	5	2	1992	1.026156	2	1	9.097134e+05	8.852059e+05
8	3781	2	1	1989	3.164076	0	9	7.926815e+05	7.965207e+05
9	4243	2	1	2002	4.498088	2	7	9.474908e+05	9.316157e+05

	Feature	Actual	Optimal	Percentage Change	Absolute Change
0	Square_Footage	4012.000000	4814.400000	20.000000	802.400000
1	Num_Bedrooms	3.000000	4.000000	33.300000	1.000000
2	Num_Bathrooms	1.000000	2.000000	100.000000	1.000000
3	Year_Built	2016.000000	2016.000000	0.000000	0.000000
4	Lot_Size	2.100000	2.520000	20.000000	0.420000
5	Garage_Size	1.000000	2.000000	100.000000	1.000000
6	Neighborhood_Quality	5.000000	6.000000	20.000000	1.000000

EazyML Counterfactual Template

Define Imports

1. Initialize EazyML

2. Define Dataset Files and Outcome Variable

3. Dataset Information

Columns in the Dataset:

3.1 Display the Dataset

4. Custom Modeling with Scikit-learn

4.1 Unified Preprocessing Class for Regression

4.2 Train and Evaluate Linear Regression Model

5. EazyML Counterfactual Inference

5.1 Define Counterfactual Inference Configuration

5.2 Perform Counterfactual Inference

5.3 Display Results