Predicting faulty water pumps on the Tanzanian water points dataset using scikit-learn classifiers

Vadym Byesyedin
May 12, 2021


Scikit-learn is a Python module for machine learning that provides many regression, classification, and clustering algorithms.

We will run several classification models and compare their accuracy scores on data from Taarifa and the Tanzanian Ministry of Water.

Full code is available on GitHub.

First, let’s import the required libraries

import pickle
import plotly.express as px
from scipy import stats
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, plot_confusion_matrix
import warnings
warnings.filterwarnings('ignore')

Then let’s load the dataset

raw_df = pd.read_csv('data/downloaded/train_data.csv')
raw_df.head()

Check and fill NA values in the categorical features

raw_df_na_cols = raw_df.columns[raw_df.isna().any()].tolist()
print('{0:20}{1:10}{2}\n'.format('Column name', 'Type', 'Filled NA values count'))
for col in raw_df_na_cols:
    raw_df[col] = raw_df[col].fillna('not_filled')
    print('{0:20}{1:10}{2}'.format(
        col,
        str(raw_df[col].dtype),
        len(raw_df[raw_df[col] == "not_filled"])
    ))

Select features that we will use for model fitting

features = raw_df[[
    'funder', 'gps_height', 'installer', 'longitude', 'latitude',
    'basin', 'region', 'population', 'public_meeting',
    'scheme_management', 'permit', 'construction_year',
    'extraction_type_class', 'management_group', 'payment_type',
    'quality_group', 'quantity', 'source_type'
]]

Load the labels; the target status_group has three classes: functional, functional needs repair, and non functional

labels = pd.read_csv('data/downloaded/train_labels.csv')
labels = labels.status_group
labels.unique()

Concatenate features and labels

data = pd.concat([features,labels], axis=1)
data.info()

Convert boolean values to strings so they can be one-hot encoded later

# Keep non-bool cells as they are; replace bool cells with 'yes'/'no'
mask = data.applymap(type) != bool
d = {True: 'yes', False: 'no'}
data = data.where(mask, data.replace(d))

Split the dataset first to avoid data leakage

train, test = train_test_split(data, random_state=42)

Reduce the number of unique values in selected columns to 100, assigning the value ‘Other’ to the less frequent categories, using custom helper functions (sketched after the code below)

train_funder_col_names, assigned_val, train.funder = \
    hlp.reduce_cat_num(train.funder, 'Other')
test.funder = hlp.apply_reduced_cats(
    test.funder, train_funder_col_names, assigned_val)
train_installer_col_names, assigned_val, train.installer = \
    hlp.reduce_cat_num(train.installer, 'Other')
test.installer = hlp.apply_reduced_cats(
    test.installer, train_installer_col_names, assigned_val)
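
Here hlp is the author’s helper module from the full notebook on GitHub. A minimal sketch of the idea (keep the most frequent categories, collapse the rest into one placeholder, and reuse the training category list on the test split) might look like this; the signatures are inferred from the calls above, so treat them as assumptions rather than the exact implementation:

def reduce_cat_num(series, assigned_val, max_cats=100):
    # Keep the max_cats most frequent categories and collapse the
    # rest into assigned_val (e.g. 'Other')
    kept_cats = series.value_counts().index[:max_cats].tolist()
    reduced = series.where(series.isin(kept_cats), assigned_val)
    return kept_cats, assigned_val, reduced

def apply_reduced_cats(series, kept_cats, assigned_val):
    # Apply the same reduction to unseen data, reusing the
    # category list learned on the training split
    return series.where(series.isin(kept_cats), assigned_val)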

Divide the dataset into features and labels

y_train = train.status_group
X_train = train.drop('status_group', axis=1)
y_test = test.status_group
X_test = test.drop('status_group', axis=1)

Set up the data preprocessor using sklearn’s ColumnTransformer()

numeric_features = ['gps_height', 'longitude', 'latitude', 'population']
numeric_transformer = Pipeline(steps=[
    # Zeros act as placeholders in these columns, so treat 0 as missing
    ('imputer', SimpleImputer(missing_values=0, strategy='median')),
    ('scaler', StandardScaler())])
categorical_features = X_train.drop(columns=numeric_features).columns

# handle_unknown='ignore' keeps unseen test categories from raising errors
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Fit the models using a custom helper function (sketched after the code below)

estimators = [
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', BaggingClassifier(
                DecisionTreeClassifier(random_state=42),
                random_state=42, n_jobs=-1))
        ]),
        'param_grid': [{
            'classifier__base_estimator__max_depth': [23, 25, 27],
            'classifier__base_estimator__min_samples_split': [4, 6, 8],
            'classifier__base_estimator__min_samples_leaf': [1, 2]
        }]
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', xgb.XGBClassifier(random_state=42, n_jobs=-1))
        ]),
        'param_grid': [{
            'classifier__max_depth': [23, 25, 27],
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [.01, .1, .5]
        }]
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_jobs=-1))
        ]),
        'param_grid': [{
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [23, 25, 27],
            'classifier__min_samples_split': [4, 6, 8],
            'classifier__min_samples_leaf': [1, 2]
        }]
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', KNeighborsClassifier(n_jobs=-1))
        ]),
        'param_grid': [{
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance']
        }]
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', MLPClassifier(solver='lbfgs', alpha=1,
                                         random_state=42, max_iter=1000))
        ]),
        'param_grid': [{
            'classifier__max_iter': [1000, 1500, 2000],
            'classifier__alpha': 10.0 ** -np.arange(1, 10),
            'classifier__hidden_layer_sizes': [5, 10, 15]
        }]
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', AdaBoostClassifier(random_state=42))
        ]),
        'param_grid': [{
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [.01, .1, .5]
        }]
    },
]

models, results = hlp.selectEstimator(
    estimators, X_train, X_test, y_train, y_test, verbose=True)
display(results)
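
selectEstimator is another repo helper. Judging from how it is called, it grid-searches every candidate pipeline, scores it on the held-out split, and tabulates the results; entries without a 'param_grid' fall back to an empty grid. A minimal sketch under those assumptions (the exact implementation is in the GitHub repo):

def selectEstimator(estimators, X_train, X_test, y_train, y_test, verbose=False):
    # Grid-search each candidate pipeline and score it on the held-out split
    models, rows = [], []
    for est in estimators:
        gs = GridSearchCV(est['pipeline'],
                          param_grid=est.get('param_grid', [{}]),
                          scoring='accuracy', cv=5, n_jobs=-1)
        gs.fit(X_train, y_train)
        acc = accuracy_score(y_test, gs.predict(X_test))
        name = gs.best_estimator_.named_steps['classifier'].__class__.__name__
        if verbose:
            print(name, gs.best_params_, acc)
        models.append(gs)
        rows.append({'classifier': name,
                     'best_params': gs.best_params_,
                     'test_accuracy': acc})
    return models, pd.DataFrame(rows)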

Repeat the previous step with refined hyperparameters

estimators = [
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', BaggingClassifier(
                DecisionTreeClassifier(random_state=42, max_depth=26,
                                       min_samples_split=9),
                random_state=42, n_jobs=-1))
        ]),
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', xgb.XGBClassifier(random_state=42, n_jobs=-1,
                                             learning_rate=.1))
        ]),
        'param_grid': [{
            'classifier__max_depth': [11, 13, 15],
        }]
    },
    {
        'pipeline': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_jobs=-1, criterion='entropy',
                                                  min_samples_split=4))
        ]),
        'param_grid': [{
            'classifier__max_depth': [31, 32, 33],
        }]
    },
]

models, results = hlp.selectEstimator(
    estimators, X_train, X_test, y_train, y_test, verbose=True)
display(results)
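
Before fitting the final model, it can be worth looking past a single accuracy number. A quick sketch using the accuracy_score and plot_confusion_matrix we imported earlier; best_model and its index are assumptions standing in for whichever fitted grid search scored highest:

# Assumption: the RandomForest grid search came out on top
best_model = models[2]
print('Test accuracy:', accuracy_score(y_test, best_model.predict(X_test)))
plot_confusion_matrix(best_model.best_estimator_, X_test, y_test)
plt.show()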

Fit the final model for submission

with open('data/preprocessed/train_for_SM.pickle', 'rb') as file:
    train_for_SM = pickle.load(file)

with open('data/preprocessed/test_for_SM.pickle', 'rb') as file:
    test_for_SM = pickle.load(file)

y_train_sm = train_for_SM.status_group
X_train_sm = train_for_SM.drop('status_group', axis=1)
X_test_sm = test_for_SM

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_jobs=-1, criterion='entropy',
        min_samples_split=4, max_depth=32
    ))
])
gridsearch = GridSearchCV(pipe, param_grid=[{}],
                          scoring='accuracy', cv=5)
# Fit the training data
gridsearch.fit(X_train_sm, y_train_sm)
# Predict the test labels
y_hat = gridsearch.predict(X_test_sm)

submission_format = pd.read_csv(
    'data/downloaded/SubmissionFormat.csv')
submission_format['status_group'] = y_hat
submission_format.to_csv(
    'data/preprocessed/predicted_test_labels.csv', index=False)

After that, we upload the predicted labels to drivendata.org and get our score.

I would say it is not a bad result for classification models.

Full code is available on GitHub.

CONCLUSIONS

We fitted several classification models and got the best results with RandomForestClassifier() and XGBClassifier(). Both performed well after hyperparameter tuning.
