mirror of
https://github.com/fenago/data-science.git
synced 2026-05-05 17:11:52 +00:00
336 lines
7.6 KiB
Plaintext
336 lines
7.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ktFt6IzKJkSn"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import warnings\n",
|
|
"warnings.filterwarnings(\"ignore\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "_JoUdRaiIwo4"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Relative path to the crx.data dataset file that is read in the next cell\n",
|
|
"filename = '../Dataset/crx.data'\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "0ZmzTR-CJra-",
|
|
"outputId": "d3930e63-2ba8-4ed5-b587-cd7b4dc209e7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Loading the data using pandas\n",
|
|
"\n",
|
|
"credData = pd.read_csv(filename,sep=\",\",header = None,na_values = \"?\")\n",
|
|
"credData.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "rXYA47JRKVz-",
|
|
"outputId": "09439927-ebd1-4a6f-adb4-de3a7cb2a55d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Encoding the class labels in column 15: '+' becomes 1 and '-' becomes 0\n",
|
|
"credData.loc[credData[15] == '+' , 15] = 1\n",
|
|
"credData.loc[credData[15] == '-' , 15] = 0\n",
|
|
"credData.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "R9-NFhigmokr",
|
|
"outputId": "502dc65e-de17-423a-b2af-7778fd4e76ba"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Dropping all the rows with na values\n",
|
|
"newcred = credData.dropna(axis = 0)\n",
|
|
"newcred.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "vxzQZpXMZZN6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Separating the categorical variables to make dummy variables\n",
|
|
"\n",
|
|
"credCat = pd.get_dummies(newcred[[0,3,4,5,6,8,9,11,12]])\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "__Gup8InbTmf"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Separating the numerical variables\n",
|
|
"\n",
|
|
"credNum = newcred[[1,2,7,10,13,14]]\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"id": "5S_Icyu1r8YJ",
|
|
"outputId": "4ae45242-019c-46f7-e79f-37451cea8ad1"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Making the X variable which is a concatenation of categorical and numerical data\n",
|
|
"\n",
|
|
"# The dummy columns carry str labels while the numerical columns keep int labels;\n",
|
|
"# scikit-learn >= 1.2 raises a TypeError on mixed label types, so every label is cast to str\n",
|
|
"X = pd.concat([credCat,credNum],axis = 1).rename(columns=str)\n",
|
|
"print(X.shape)\n",
|
|
"\n",
|
|
"# Separating the label as y variable\n",
|
|
"y = pd.Series(newcred[15], dtype=\"int\")\n",
|
|
"print(y.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 224
|
|
},
|
|
"colab_type": "code",
|
|
"id": "ZflX7J-5GtY_",
|
|
"outputId": "235118e8-e90c-4b51-c073-ba12614510f9"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Normalising the data sets\n",
|
|
"# Import library function\n",
|
|
"from sklearn import preprocessing\n",
|
|
"# Creating the min-max scaler object\n",
|
|
"minmaxScaler = preprocessing.MinMaxScaler()\n",
|
|
"# Fitting the scaler to X and transforming X with it\n",
|
|
"X_tran = pd.DataFrame(minmaxScaler.fit_transform(X))\n",
|
|
"# Printing the output\n",
|
|
"X_tran.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "iJiHJ6zWJ9y_"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Splitting the data set to train and test sets\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"# Holding out 30% of the rows as the test set, with a fixed seed so the split is repeatable\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X_tran, y, test_size=0.3, random_state=123)\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "n4I7ujXBxnNj"
|
|
},
|
|
"source": [
|
|
"**Stacking**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "TsNS-IAN3urW"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Importing the meta learner and base learners\n",
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"\n",
|
|
"bl1 = KNeighborsClassifier(n_neighbors=5)\n",
|
|
"bl2 = RandomForestClassifier(random_state=123)\n",
|
|
"ml = LogisticRegression(random_state=123)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 71
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Z6BvZ7Jzxmph",
|
|
"outputId": "22028791-1461-406f-9e4d-8fb820e19e11"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Creating the stacking classifier\n",
|
|
"from mlxtend.classifier import StackingClassifier\n",
|
|
"stackclf = StackingClassifier(classifiers=[bl1, bl2], \n",
|
|
" meta_classifier=ml)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 105
|
|
},
|
|
"colab_type": "code",
|
|
"id": "LVq5ljcdx4R4",
|
|
"outputId": "b5a6170f-a284-411b-ddbf-bf8ab163f974"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Fitting the model on the training set\n",
|
|
"model = stackclf.fit(X_train, y_train)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "R9kr6DBeyKuX"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Generating predictions on test set\n",
|
|
"pred = model.predict(X_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 170
|
|
},
|
|
"colab_type": "code",
|
|
"id": "BMFA_SivyVRv",
|
|
"outputId": "68f9115b-28b0-4d9a-bf35-fc72df2e6dda"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Printing the classification report\n",
|
|
"from sklearn.metrics import classification_report\n",
|
|
"print(classification_report(y_test, pred))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"id": "zTwx5_3GyeoR",
|
|
"outputId": "da4d0c06-b93c-4509-9fdd-3a20fea54323"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Printing the confusion matrix\n",
|
|
"from sklearn.metrics import confusion_matrix\n",
|
|
"print(confusion_matrix(y_test, pred))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Exercise 15.07 : Ensemble learning - Stacking",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|