Predicting faulty water pumps on Tanzanian water points dataset using scikit-learn classifiers
Scikit-learn is a Python module for machine learning that provides a wide range of regression, classification, and clustering algorithms.
We will run several classification models and compare their accuracy scores on data from Taarifa and the Tanzanian Ministry of Water.
Full code is available on GitHub.
First, let’s import the required libraries
import pickle
import plotly.express as px
from scipy import stats
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, plot_confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Custom helpers used below (reduce_cat_num, apply_reduced_cats,
# selectEstimator); the module name is assumed here, full code is on GitHub
import helpers as hlp
Then let’s load the dataset
raw_df = pd.read_csv('data/downloaded/train_data.csv')
raw_df.head()
Check for and fill NA values in categorical features
raw_df_na_cols = raw_df.columns[raw_df.isna().any()].tolist()

print('{0:20}{1:10}{2}\n'.format('Column name', 'Type', 'Filled NA values count'))
for col in raw_df_na_cols:
    raw_df[col] = raw_df[col].fillna('not_filled')
    print('{0:20}{1:10}{2}'.format(
        col,
        str(raw_df[col].dtype),
        len(raw_df[raw_df[col] == 'not_filled'])
    ))
Select the features that we will use for model fitting
features = raw_df[[ 'funder', 'gps_height', 'installer',
'longitude', 'latitude', 'basin', 'region', 'population',
'public_meeting', 'scheme_management', 'permit',
'construction_year', 'extraction_type_class', 'management_group',
'payment_type', 'quality_group', 'quantity', 'source_type']]
Load the labels; status_group is the target, with three classes: functional, functional needs repair, and non functional
labels = pd.read_csv('data/downloaded/train_labels.csv')
labels = labels.status_group
labels.unique()
Concatenate features and labels
data = pd.concat([features,labels], axis=1)
data.info()
Convert boolean values to strings
mask = data.applymap(type) != bool
d = {True: 'yes', False: 'no'}
data = data.where(mask, data.replace(d))
Split the dataset first to avoid data leakage
train, test = train_test_split(data, random_state=42)
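The classes in this dataset are imbalanced (hence the SMOTE/ADASYN imports above), so it is worth glancing at the class distribution in the training split. A quick illustrative check, not part of the original notebook:
# Illustrative: inspect class balance in the training split
print(train.status_group.value_counts(normalize=True))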
Decrease the number of unique values in selected columns to 100 (assigning the value ‘Other’ to less frequent categories) using a custom helper function
train_funder_col_names, assigned_val, train.funder = hlp.reduce_cat_num(train.funder, 'Other')
test.funder = hlp.apply_reduced_cats(test.funder, train_funder_col_names, assigned_val)

train_installer_col_names, assigned_val, train.installer = hlp.reduce_cat_num(train.installer, 'Other')
test.installer = hlp.apply_reduced_cats(test.installer, train_installer_col_names, assigned_val)
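The helper itself lives in the repo; here is a minimal sketch of what reduce_cat_num and apply_reduced_cats might look like, with signatures inferred from the calls above and internals assumed:
# Hypothetical sketch; the real implementations are on GitHub
def reduce_cat_num(series, other_label, top_n=100):
    # Keep the top_n most frequent categories, map the rest to other_label
    kept = series.value_counts().nlargest(top_n).index
    return kept, other_label, series.where(series.isin(kept), other_label)

def apply_reduced_cats(series, kept, other_label):
    # Apply the training-set category list to unseen data
    return series.where(series.isin(kept), other_label)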
Divide the dataset into features and labels
y_train = train.status_group
X_train = train.drop('status_group', axis=1)
y_test = test.status_group
X_test = test.drop('status_group', axis=1)
Set up the data preprocessor using scikit-learn’s ColumnTransformer()
numeric_features = ['gps_height', 'longitude', 'latitude', 'population']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=0, strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = X_train.drop(columns=numeric_features, axis=1).columns
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
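Before fitting full models, it can help to sanity-check the preprocessor on its own. A quick illustrative check, not in the original:
# Illustrative: fit the preprocessor alone and inspect the output shape
# (4 scaled numeric columns plus one one-hot column per category)
X_check = preprocessor.fit_transform(X_train)
print(X_check.shape)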
Fit models using a custom helper function
estimators = [
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', BaggingClassifier(DecisionTreeClassifier(random_state=42),random_state=42,n_jobs=-1))
]),
'param_grid': [{
'classifier__base_estimator__max_depth': [23,25, 27],
'classifier__base_estimator__min_samples_split': [4,6,8],
'classifier__base_estimator__min_samples_leaf': [1,2]
}]
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', xgb.XGBClassifier(random_state=42,n_jobs=-1))
]),
'param_grid': [{
'classifier__max_depth': [23,25, 27],
'classifier__n_estimators': [100,200],
'classifier__learning_rate': [.01,.1,.5]
}]
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_jobs=-1))
]),
'param_grid': [{
'classifier__criterion': ['gini','entropy'],
'classifier__max_depth': [23,25, 27],
'classifier__min_samples_split': [4,6,8],
'classifier__min_samples_leaf': [1,2]
}]
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', KNeighborsClassifier(n_jobs=-1))
]),
'param_grid': [{
'classifier__n_neighbors': [3,5,7],
'classifier__weights': ['uniform','distance']
}]
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', MLPClassifier(solver='lbfgs', alpha=1, random_state=42, max_iter=1000))
]),
'param_grid': [{
'classifier__max_iter': [1000,1500,2000],
'classifier__alpha': 10.0 ** -np.arange(1, 10),
'classifier__hidden_layer_sizes': [5,10,15]
}]
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', AdaBoostClassifier(random_state=42))
]),
'param_grid': [{
'classifier__n_estimators': [100,200],
'classifier__learning_rate': [.01,.1,.5]
}]
},
]

models, results = hlp.selectEstimator(estimators, X_train, X_test, y_train, y_test, verbose=True)
display(results)
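selectEstimator is another repo helper; here is a minimal sketch, assuming it grid-searches each pipeline and collects test accuracies (the real version is on GitHub):
# Hypothetical sketch of the helper; the real version is in the repo
def selectEstimator(estimators, X_train, X_test, y_train, y_test, verbose=False):
    models, rows = [], []
    for est in estimators:
        # Grid-search each pipeline ([{}] means fit with defaults)
        grid = GridSearchCV(est['pipeline'],
                            param_grid=est.get('param_grid', [{}]),
                            scoring='accuracy', cv=5)
        grid.fit(X_train, y_train)
        acc = accuracy_score(y_test, grid.predict(X_test))
        name = grid.best_estimator_.named_steps['classifier'].__class__.__name__
        if verbose:
            print(f'{name}: {acc:.4f}')
        models.append(grid)
        rows.append({'classifier': name,
                     'best_params': grid.best_params_,
                     'test_accuracy': acc})
    return models, pd.DataFrame(rows)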
Iterate on the previous step with different hyperparameters
estimators = [
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', BaggingClassifier(
DecisionTreeClassifier(random_state=42,max_depth=26,min_samples_split=9),
random_state=42,n_jobs=-1))
]),
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', xgb.XGBClassifier(random_state=42,n_jobs=-1,learning_rate=.1))
]),
'param_grid': [{
'classifier__max_depth': [11,13,15],
}]
},
{
'pipeline': Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_jobs=-1,criterion='entropy',min_samples_split=4))
]),
'param_grid': [{
'classifier__max_depth': [31,32,33],
}]
},
]

models, results = hlp.selectEstimator(estimators, X_train, X_test, y_train, y_test, verbose=True)
display(results)
Fit the model for submission
with open('data/preprocessed/train_for_SM.pickle', 'rb') as file:
    train_for_SM = pickle.load(file)
with open('data/preprocessed/test_for_SM.pickle', 'rb') as file:
    test_for_SM = pickle.load(file)

y_train_sm = train_for_SM.status_group
X_train_sm = train_for_SM.drop('status_group', axis=1)
X_test_sm = test_for_SM

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_jobs=-1, criterion='entropy',
        min_samples_split=4, max_depth=32
    ))
])
gridsearch = GridSearchCV(pipe, param_grid=[{}],
                          scoring='accuracy', cv=5)

# Fit the training data
gridsearch.fit(X_train_sm, y_train_sm)

# Predict
y_hat = gridsearch.predict(X_test_sm)

submission_format = pd.read_csv('data/downloaded/SubmissionFormat.csv')
submission_format['status_group'] = y_hat
submission_format.to_csv('data/preprocessed/predicted_test_labels.csv', index=False)
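Before uploading, a quick illustrative check that the submission file has the expected columns and row count (not in the original):
# Illustrative: confirm the submission columns and row count
check = pd.read_csv('data/preprocessed/predicted_test_labels.csv')
print(check.columns.tolist(), len(check))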
After that, we upload the predicted labels to drivendata.org and get our score.
I would say it is not a bad result for classification models.
Full code is available on GitHub.
CONCLUSIONS
We fitted several classification models and got the best results with RandomForestClassifier() and XGBClassifier(). Both performed well after hyperparameter tuning.