mlessentials/Lab15/Exercise15.07/Exercise_15_07_Ensemble_learning_Stacking.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ktFt6IzKJkSn"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "_JoUdRaiIwo4"
   },
   "outputs": [],
   "source": [
    "#Loading data from the Github repository to colab notebook\n",
    "filename = '../Dataset/crx.data'\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "0ZmzTR-CJra-",
    "outputId": "d3930e63-2ba8-4ed5-b587-cd7b4dc209e7"
   },
   "outputs": [],
   "source": [
    "# Loading the data using pandas\n",
    "\n",
    "credData = pd.read_csv(filename,sep=\",\",header = None,na_values = \"?\")\n",
    "credData.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "rXYA47JRKVz-",
    "outputId": "09439927-ebd1-4a6f-adb4-de3a7cb2a55d"
   },
   "outputs": [],
   "source": [
    "# Changing the Classes to 1 & 0\n",
    "credData.loc[credData[15] == '+' , 15] = 1\n",
    "credData.loc[credData[15] == '-' , 15] = 0\n",
    "credData.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "R9-NFhigmokr",
    "outputId": "502dc65e-de17-423a-b2af-7778fd4e76ba"
   },
   "outputs": [],
   "source": [
    "# Dropping all the rows with na values\n",
    "newcred = credData.dropna(axis = 0)\n",
    "newcred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "vxzQZpXMZZN6"
   },
   "outputs": [],
   "source": [
    "# Seperating the categorical variables to make dummy variables\n",
    "\n",
    "credCat = pd.get_dummies(newcred[[0,3,4,5,6,8,9,11,12]])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "__Gup8InbTmf"
   },
   "outputs": [],
   "source": [
    "# Seperating the numerical variables\n",
    "\n",
    "credNum = newcred[[1,2,7,10,13,14]]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "colab_type": "code",
    "id": "5S_Icyu1r8YJ",
    "outputId": "4ae45242-019c-46f7-e79f-37451cea8ad1"
   },
   "outputs": [],
   "source": [
    "# Making the X variable which is a concatenation of categorical and numerical data\n",
    "\n",
    "X = pd.concat([credCat,credNum],axis = 1)\n",
    "print(X.shape)\n",
    "\n",
    "# Seperating the label as y variable\n",
    "y = pd.Series(newcred[15], dtype=\"int\")\n",
    "print(y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 224
    },
    "colab_type": "code",
    "id": "ZflX7J-5GtY_",
    "outputId": "235118e8-e90c-4b51-c073-ba12614510f9"
   },
   "outputs": [],
   "source": [
    "# Normalising the data sets\n",
    "# Import library function\n",
    "from sklearn import preprocessing\n",
    "# Creating the scaling function\n",
    "minmaxScaler = preprocessing.MinMaxScaler()\n",
    "# Transforming with the scaler function\n",
    "X_tran = pd.DataFrame(minmaxScaler.fit_transform(X))\n",
    "# Printing the output\n",
    "X_tran.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "iJiHJ6zWJ9y_"
   },
   "outputs": [],
   "source": [
    "# Splitting the data set to train and test sets\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Splitting the data into train and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_tran, y, test_size=0.3, random_state=123)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "n4I7ujXBxnNj"
   },
   "source": [
    "**Stacking**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "TsNS-IAN3urW"
   },
   "outputs": [],
   "source": [
    "# Importing the meta learner and base learners\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "bl1 = KNeighborsClassifier(n_neighbors=5)\n",
    "bl2 = RandomForestClassifier(random_state=123)\n",
    "ml = LogisticRegression(random_state=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 71
    },
    "colab_type": "code",
    "id": "Z6BvZ7Jzxmph",
    "outputId": "22028791-1461-406f-9e4d-8fb820e19e11"
   },
   "outputs": [],
   "source": [
    "# Creating the stacking classifier\n",
    "from mlxtend.classifier import StackingClassifier\n",
    "stackclf = StackingClassifier(classifiers=[bl1, bl2], \n",
    "                          meta_classifier=ml)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 105
    },
    "colab_type": "code",
    "id": "LVq5ljcdx4R4",
    "outputId": "b5a6170f-a284-411b-ddbf-bf8ab163f974"
   },
   "outputs": [],
   "source": [
    "# Fitting the model on the training set\n",
    "model = stackclf.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "R9kr6DBeyKuX"
   },
   "outputs": [],
   "source": [
    "# Generating predictions on test set\n",
    "pred = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 170
    },
    "colab_type": "code",
    "id": "BMFA_SivyVRv",
    "outputId": "68f9115b-28b0-4d9a-bf35-fc72df2e6dda"
   },
   "outputs": [],
   "source": [
    "# Printing the classification report\n",
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(y_test, pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "colab_type": "code",
    "id": "zTwx5_3GyeoR",
    "outputId": "da4d0c06-b93c-4509-9fdd-3a20fea54323"
   },
   "outputs": [],
   "source": [
    "# Printing the confusion matrix\n",
    "from sklearn.metrics import confusion_matrix\n",
    "print(confusion_matrix(y_test, pred))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Exercise 15.07 : Ensemble learning - Stacking",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}