Files
mlessentials/Lab13/Lab_13_Unbalanced_Data_sets_v1.0.ipynb
Your Name 54ccb1423f added
2021-02-08 11:17:02 +00:00

698 lines
18 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "h6UINJEPtGQJ"
},
"outputs": [],
"source": [
"# Loading the necessary library files\n",
"import pandas as pd\n",
"\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"colab_type": "code",
"id": "B4bxRZg_tqnp",
"outputId": "8f916c9e-31cc-437b-eb3c-90746a1ff121"
},
"outputs": [],
"source": [
"# Please change the filename as per the location where the file is stored\n",
"filename = './Dataset/bank-full.csv'\n",
"# Loading the data using pandas\n",
"\n",
"bankData = pd.read_csv(filename,sep=\";\")\n",
"bankData.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "3tLgOzokaNiH"
},
"source": [
"**Feature engineering steps**\n",
"\n",
"Let us now do some feature engineering to the data. First we will scale the numerical data and then convert the categorical data to \n",
"dummy variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"colab_type": "code",
"id": "LjcB4SaWiKp0",
"outputId": "c58cff7f-1b1d-4613-d9cd-e69f669f247c"
},
"outputs": [],
"source": [
"from sklearn.preprocessing import RobustScaler\n",
"\n",
"rob_scaler = RobustScaler()\n",
"\n",
"# Converting each of the columns to scaled version\n",
"\n",
"bankData['ageScaled'] = rob_scaler.fit_transform(bankData['age'].values.reshape(-1,1))\n",
"bankData['balScaled'] = rob_scaler.fit_transform(bankData['balance'].values.reshape(-1,1))\n",
"bankData['durScaled'] = rob_scaler.fit_transform(bankData['duration'].values.reshape(-1,1))\n",
"\n",
"# Dropping the original columns\n",
"\n",
"bankData.drop(['age','balance','duration'], axis=1, inplace=True)\n",
"\n",
"# Print the head of the data\n",
"\n",
"bankData.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "8glxMnxxN8NB"
},
"outputs": [],
"source": [
"# Converting all the categorical variables to dummy variables\n",
"bankCat = pd.get_dummies(bankData[['job','marital','education','default','housing','loan','contact','month','poutcome']])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "Ge2LlC_1ePzX",
"outputId": "ef455def-cf0c-4a9c-eb1c-6583a7015579"
},
"outputs": [],
"source": [
"# Separating the numerical data\n",
"bankNum = bankData[['ageScaled','balScaled','day','durScaled','campaign','pdays','previous']]\n",
"bankNum.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 275
},
"colab_type": "code",
"id": "EsT4u_jDQ7Sv",
"outputId": "aaff137c-2e31-4555-f1b3-d1ff3750650f"
},
"outputs": [],
"source": [
"# Merging with the original data frame\n",
"# Preparing the X variables\n",
"X = pd.concat([bankCat, bankNum], axis=1)\n",
"print(X.shape)\n",
"# Preparing the Y variable\n",
"Y = bankData['y']\n",
"print(Y.shape)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 156
},
"colab_type": "code",
"id": "h8DjrmI53bEt",
"outputId": "9652aa5c-8518-4b1d-aa50-d9708232d3e9"
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"# Splitting the data into train and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)\n",
"# Defining the LogisticRegression function\n",
"bankModel = LogisticRegression()\n",
"bankModel.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"colab_type": "code",
"id": "5JWHRmRTTloL",
"outputId": "6f2297bc-531f-47e9-e1c9-c390a634a5ec"
},
"outputs": [],
"source": [
"pred = bankModel.predict(X_test)\n",
"print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(bankModel.score(X_test, y_test)))\n",
"\n",
"# Confusion Matrix for the model\n",
"from sklearn.metrics import confusion_matrix\n",
"confusionMatrix = confusion_matrix(y_test, pred)\n",
"print(confusionMatrix)\n",
"\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test, pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 85
},
"colab_type": "code",
"id": "6SNINkDUhqe8",
"outputId": "d639890f-a63a-4f58-fa85-c9a73c084be3"
},
"outputs": [],
"source": [
"print('Percentage of positive class :',(y_train[y_train=='yes'].value_counts()/len(y_train) ) * 100)\n",
"print('Percentage of negative class :',(y_train[y_train=='no'].value_counts()/len(y_train) ) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WUDWDskEfUux"
},
"source": [
"**Undersampling Method.**\n",
"\n",
"In the random undersampling method, we down sample the majority class to the same amount as the minority class to make the data set balanced. Let us see how we can achieve that\n",
"\n",
"In this method we first identify the count of the minority cases and then undersample the majority cases to be the same as minority cases. \n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "DLad9gfcKCeo"
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Splitting the data into train and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"colab_type": "code",
"id": "-Q_AHv77lTwk",
"outputId": "ffdff3f2-191c-46d0-b996-656ca81a263e"
},
"outputs": [],
"source": [
"# Let us first join X_train and y_train for ease of operation\n",
"\n",
"trainData = pd.concat([X_train,y_train],axis=1)\n",
"trainData.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 309
},
"colab_type": "code",
"id": "ykfsvEdgfTtt",
"outputId": "b7efc9db-0e5a-4a4d-b73f-0de5605f1030"
},
"outputs": [],
"source": [
"# Finding the indexes of the sample data set where the propensity is 'yes'\n",
"ind = trainData[trainData['y']=='yes'].index\n",
"print(len(ind))\n",
"\n",
"# Separate the minority class\n",
"minData = trainData.loc[ind]\n",
"print(minData.shape)\n",
"\n",
"# Finding indexes of majority class\n",
"ind1 = trainData[trainData['y']=='no'].index\n",
"print(len(ind1))\n",
"# Separating the majority class\n",
"majData = trainData.loc[ind1]\n",
"print(majData.shape)\n",
"majData.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 258
},
"colab_type": "code",
"id": "f2APbvt3ovEP",
"outputId": "06598bb1-580f-414e-9bf9-304e02d0fe12"
},
"outputs": [],
"source": [
"# Take a random sample equal to length of the minority class to make the data set balanced\n",
"\n",
"majSample = majData.sample(n=len(ind),random_state = 123)\n",
"print(majSample.shape)\n",
"majSample.head()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 258
},
"colab_type": "code",
"id": "kPgQS0ZEp8qV",
"outputId": "339b0fe3-8e2b-4e45-a1e1-572c53afd697"
},
"outputs": [],
"source": [
"# Concatenating both data sets and then shuffling the data set\n",
"\n",
"balData = pd.concat([minData,majSample],axis = 0)\n",
"print('balanced data set shape',balData.shape)\n",
"\n",
"# Shuffling the data set\n",
"\n",
"from sklearn.utils import shuffle\n",
"\n",
"balData = shuffle(balData)\n",
"balData.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"id": "oKqn7nO8pDVX",
"outputId": "000d829f-1334-47fe-c0c9-a2d6b4d49c87"
},
"outputs": [],
"source": [
"# Making the new X_train and y_train\n",
"\n",
"X_trainNew = balData.iloc[:,0:51]\n",
"X_trainNew.head()\n",
"\n",
"y_trainNew = balData['y']\n",
"y_trainNew.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 88
},
"colab_type": "code",
"id": "9nXONaFanHR4",
"outputId": "3084e814-7e54-46ed-999c-2c68048af599"
},
"outputs": [],
"source": [
"\n",
"# Defining the LogisticRegression function\n",
"bankModel1 = LogisticRegression()\n",
"bankModel1.fit(X_trainNew, y_trainNew)\n",
"\n",
"# Predicting on the test\n",
"pred = bankModel1.predict(X_test)\n",
"print('Accuracy of Logistic regression model prediction on test set for balanced data set: {:.2f}'.format(bankModel1.score(X_test, y_test)))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "-BmM1E58xNEh",
"outputId": "aecf1a8e-f0f2-4695-b23b-977a759a7584"
},
"outputs": [],
"source": [
"# Confusion Matrix for the model\n",
"from sklearn.metrics import confusion_matrix\n",
"confusionMatrix = confusion_matrix(y_test, pred)\n",
"print(confusionMatrix)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"id": "xCidAiJbxjuI",
"outputId": "20f9e97c-7b9c-4748-c12f-4756af1611b3"
},
"outputs": [],
"source": [
"# Confusion Matrix for the model\n",
"from sklearn.metrics import confusion_matrix\n",
"confusionMatrix = confusion_matrix(y_test, pred)\n",
"print(confusionMatrix)\n",
"\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test, pred))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tVageK142kfy"
},
"source": [
"**Random Over Sampling**\n",
"\n",
"Let us now try the over sampling method and find what effect it has on the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
},
"colab_type": "code",
"id": "WPTjlvKdYHd2",
"outputId": "ecb38d3a-40f2-487c-ab23-4d828149b68f"
},
"outputs": [],
"source": [
"# Splitting the data into train and test sets\n",
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)\n",
"\n",
"print(\"Before OverSampling count of yes: {}\".format(sum(y_train=='yes')))\n",
"print(\"Before OverSampling count of no: {} \\n\".format(sum(y_train=='no')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 139
},
"colab_type": "code",
"id": "dFVuB-2K83QE",
"outputId": "64cccac7-64ab-40e8-8357-0f2cfd1e1bd3"
},
"outputs": [],
"source": [
"import smote_variants as sv\n",
"import numpy as np\n",
"\n",
"# Instantiating the SMOTE class\n",
"oversampler= sv.SMOTE()\n",
"\n",
"# Creating new training set\n",
"\n",
"X_train_us, y_train_us = oversampler.sample(np.array(X_train), np.array(y_train))\n",
"\n",
"# Shape after oversampling\n",
"\n",
"print('After OverSampling, the shape of train_X: {}'.format(X_train_us.shape))\n",
"print('After OverSampling, the shape of train_y: {} \\n'.format(y_train_us.shape))\n",
"\n",
"print(\"After OverSampling, counts of label 'Yes': {}\".format(sum(y_train_us=='yes')))\n",
"print(\"After OverSampling, counts of label 'no': {}\".format(sum(y_train_us=='no')))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 275
},
"colab_type": "code",
"id": "fd3PNkJE9-WU",
"outputId": "fa51847b-3346-4028-b21c-741dcd867272"
},
"outputs": [],
"source": [
"# Training the model with Logistic regression model\n",
"\n",
"# Defining the LogisticRegression function\n",
"\n",
"bankModel2 = LogisticRegression()\n",
"\n",
"bankModel2.fit(X_train_us, y_train_us)\n",
"\n",
"# Predicting on the test set\n",
"pred = bankModel2.predict(X_test)\n",
"\n",
"# Printing accuracy \n",
"print('Accuracy of Logistic regression model prediction on test set for Smote balanced data set: {:.2f}'.format(bankModel2.score(X_test, y_test)))\n",
"\n",
"# Confusion Matrix for the model\n",
"\n",
"from sklearn.metrics import confusion_matrix\n",
"confusionMatrix = confusion_matrix(y_test, pred)\n",
"print(confusionMatrix)\n",
"\n",
"# Classification report for the model\n",
"\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test, pred))\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "XCpFjq2R_72o"
},
"source": [
"**Activity 1**\n",
"\n",
"Implementing MSMOTE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
},
"colab_type": "code",
"id": "zkKa5ZHmADAI",
"outputId": "8942ddc4-0d0d-4f2f-e6c5-94035a6a516d"
},
"outputs": [],
"source": [
"# Splitting the data into train and test sets\n",
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)\n",
"\n",
"print(\"Before OverSampling count of yes: {}\".format(sum(y_train=='yes')))\n",
"print(\"Before OverSampling count of no: {} \\n\".format(sum(y_train=='no')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 139
},
"colab_type": "code",
"id": "xOr_xeTnsXlD",
"outputId": "7ac1cc57-6956-41a8-e483-f44f820f594c"
},
"outputs": [],
"source": [
"import smote_variants as sv\n",
"import numpy as np\n",
"# Instantiating the MSMOTE class\n",
"oversampler= sv.MSMOTE()\n",
"# Creating new training sets\n",
"X_train_us, y_train_us = oversampler.sample(np.array(X_train), np.array(y_train))\n",
"\n",
"# Shape after oversampling\n",
"print('After OverSampling, the shape of train_X: {}'.format(X_train_us.shape))\n",
"print('After OverSampling, the shape of train_y: {} \\n'.format(y_train_us.shape))\n",
"\n",
"print(\"After OverSampling, counts of label 'Yes': {}\".format(sum(y_train_us=='yes')))\n",
"print(\"After OverSampling, counts of label 'no': {}\".format(sum(y_train_us=='no')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 275
},
"colab_type": "code",
"id": "c-_FD6bGmu4S",
"outputId": "6570402c-5e1a-4ed6-8ff5-4d9f4c59cf0d"
},
"outputs": [],
"source": [
"# Fitting model\n",
"\n",
"# Training the model with Logistic regression model\n",
"\n",
"# Defining the LogisticRegression function\n",
"bankModel2 = LogisticRegression()\n",
"bankModel2.fit(X_train_us, y_train_us)\n",
"\n",
"# Predicting on the test\n",
"pred = bankModel2.predict(X_test)\n",
"print('Accuracy of Logistic regression model prediction on test set for Smote balanced data set: {:.2f}'.format(bankModel2.score(X_test, y_test)))\n",
"\n",
"# Confusion Matrix for the model\n",
"from sklearn.metrics import confusion_matrix\n",
"confusionMatrix = confusion_matrix(y_test, pred)\n",
"print(confusionMatrix)\n",
"\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test, pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Copy of Chapter 13 - Unbalanced Data sets",
"provenance": [],
"version": "0.3.2"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}