mlessentials/Lab02/All_Code/Chapter_2_Regression.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "o_xWRbP9oeJo"
   },
   "source": [
    "### **Exercise 2.01: Loading and preparing the data for analysis**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "S_e0RsA-fBKd"
   },
   "outputs": [],
   "source": [
    "# Import necessary modules\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib as mpl\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.formula.api as smf\n",
    "import statsmodels.graphics.api as smg\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import patsy\n",
    "from statsmodels.graphics.correlation import plot_corr\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later\n",
    "# releases dropped the bare 'seaborn' name, so pick whichever this install provides.\n",
    "seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'\n",
    "plt.style.use(seaborn_style)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the Boston CSV (path is relative to this notebook) into a pandas dataframe\n",
    "\n",
    "boston_csv_path = '../Dataset/Boston.csv'\n",
    "rawBostonData = pd.read_csv(boston_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "YEO5woBrpYci",
    "outputId": "8eb44eb8-87a6-432a-cf97-a01e0727d68a"
   },
   "outputs": [],
   "source": [
    "# Preview the first five rows of the dataframe\n",
    "\n",
    "rawBostonData.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Zz23WUNNpx6c"
   },
   "outputs": [],
   "source": [
    "# Discard every row that contains at least one missing value\n",
    "\n",
    "rawBostonData = rawBostonData.dropna(axis='index', how='any')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "gDCUyRscp43W"
   },
   "outputs": [],
   "source": [
    "# Remove repeated records, keeping the first occurrence of each\n",
    "\n",
    "rawBostonData = rawBostonData.drop_duplicates(keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 238
    },
    "colab_type": "code",
    "id": "Q9GyOY_9p5Pt",
    "outputId": "bcae4407-2c54-4027-9fff-48de3b803c3d"
   },
   "outputs": [],
   "source": [
    "# Show the dataframe column names as a plain Python list\n",
    "\n",
    "rawBostonData.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "BFydSue8qKaZ"
   },
   "outputs": [],
   "source": [
    "# Map the terse source headers to descriptive names; headers that are not listed\n",
    "# in the mapping keep their original label.\n",
    "# NOTE(review): the ' ZN ' and 'INDUS ' keys include surrounding spaces, presumably to\n",
    "# match the raw CSV headers exactly - confirm against the column list shown earlier.\n",
    "\n",
    "descriptiveColumnNames = {\n",
    "    'CRIM': 'crimeRatePerCapita',\n",
    "    ' ZN ': 'landOver25K_sqft',\n",
    "    'INDUS ': 'non-retailLandProptn',\n",
    "    'CHAS': 'riverDummy',\n",
    "    'NOX': 'nitrixOxide_pp10m',\n",
    "    'RM': 'AvgNo.RoomsPerDwelling',\n",
    "    'AGE': 'ProptnOwnerOccupied',\n",
    "    'DIS': 'weightedDist',\n",
    "    'RAD': 'radialHighwaysAccess',\n",
    "    'TAX': 'propTaxRate_per10K',\n",
    "    'PTRATIO': 'pupilTeacherRatio',\n",
    "    'LSTAT': 'pctLowerStatus',\n",
    "    'MEDV': 'medianValue_Ks',\n",
    "}\n",
    "renamedBostonData = rawBostonData.rename(columns=descriptiveColumnNames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 323
    },
    "colab_type": "code",
    "id": "792LNU-3qKn9",
    "outputId": "63f9cca7-24ed-42dd-ab07-d43a29499393"
   },
   "outputs": [],
   "source": [
    "# Print a concise summary of the renamed dataframe to inspect the data type of each column\n",
    "\n",
    "renamedBostonData.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 452
    },
    "colab_type": "code",
    "id": "ZEyhe_JeqKyg",
    "outputId": "9dda9806-e319-45e5-a3e5-06df3ed90550"
   },
   "outputs": [],
   "source": [
    "# Summary statistics for the numeric columns, transposed so each variable is one row\n",
    "\n",
    "numericSummary = renamedBostonData.describe(include=[np.number])\n",
    "numericSummary.transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "n5wmtur8rXFG"
   },
   "outputs": [],
   "source": [
    "# Hold out part of the data: crime rate is the target, every other column is a feature\n",
    "\n",
    "seed = 10\n",
    "test_data_size = 0.3\n",
    "y = renamedBostonData[['crimeRatePerCapita']]\n",
    "X = renamedBostonData.drop(columns='crimeRatePerCapita')\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=test_data_size, random_state=seed\n",
    ")\n",
    "\n",
    "# Re-attach the target column so each split is a single frame\n",
    "train_data = pd.concat([X_train, y_train], axis='columns')\n",
    "test_data = pd.concat([X_test, y_test], axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 793
    },
    "colab_type": "code",
    "id": "G8w04aBtrXWQ",
    "outputId": "a0751f02-6f76-43e6-e0e1-1c522255c60f"
   },
   "outputs": [],
   "source": [
    "# Create and plot a Pearson correlation matrix of the training data\n",
    "\n",
    "corrMatrix = train_data.corr(method='pearson')\n",
    "xnames = list(train_data.columns)\n",
    "ynames = list(train_data.columns)\n",
    "\n",
    "# plot_corr returns the Figure; binding it to a name keeps it from becoming the cell's\n",
    "# output value, so the inline backend does not render the same heatmap twice.\n",
    "corrFig = plot_corr(corrMatrix, xnames=xnames, ynames=ynames,\n",
    "                    title=None, normcolor=False, cmap='RdYlBu_r')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "aWfnyCJhr7SP"
   },
   "source": [
    "### **Exercise 2.02: Graphical investigation of linear relationships using Python**\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 441
    },
    "colab_type": "code",
    "id": "pZMrvpvirkMa",
    "outputId": "4b3a1921-82cf-449a-c57a-6fee9d804b20"
   },
   "outputs": [],
   "source": [
    "# Scatter crime rate against median home value and let seaborn's regplot overlay a fitted line\n",
    "\n",
    "axis_label_font = {'fontsize': 15, 'fontname': 'DejaVu Sans'}\n",
    "point_style = {'s': 20, 'color': 'royalblue', 'alpha': 1}\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "sns.regplot(\n",
    "    data=train_data,\n",
    "    x='medianValue_Ks',\n",
    "    y='crimeRatePerCapita',\n",
    "    ci=None,  # no confidence band on this plot\n",
    "    color='k',\n",
    "    scatter_kws=point_style,\n",
    "    ax=ax,\n",
    ")\n",
    "ax.set_ylabel('Crime rate per Capita', **axis_label_font)\n",
    "ax.set_xlabel(\"Median value of owner-occupied homes in $1000's\", **axis_label_font)\n",
    "ax.set_xlim(left=None, right=None)\n",
    "ax.set_ylim(bottom=None, top=30)  # cap the y-axis at 30\n",
    "ax.tick_params(axis='both', which='major', labelsize=12)\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Ncyc8Kz9uvC6"
   },
   "source": [
    "### **Exercise 2.03: Examining a possible log-linear relationship using Python**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 441
    },
    "colab_type": "code",
    "id": "P1FzdmrsrkcU",
    "outputId": "15c55406-d4d6-44e0-ee17-54213bc6ad9c"
   },
   "outputs": [],
   "source": [
    "# Use the seaborn function regplot to create a log-linear plot and fit a regression line through it\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "\n",
    "# Use a dedicated name for the transformed series: reusing `y` here would overwrite the\n",
    "# target dataframe created by the train/test split cell.\n",
    "logCrimeRate = np.log(train_data['crimeRatePerCapita'])\n",
    "sns.regplot(x='medianValue_Ks', y=logCrimeRate, ci=95, data=train_data, ax=ax,\n",
    "            color='k', scatter_kws={\"s\": 20, \"color\": \"royalblue\", \"alpha\": 1})\n",
    "ax.set_ylabel('log of Crime rate per Capita', fontsize=15,\n",
    "              fontname='DejaVu Sans')\n",
    "ax.set_xlabel(\"Median value of owner-occupied homes in $1000's\",\n",
    "              fontsize=15, fontname='DejaVu Sans')\n",
    "ax.set_xlim(left=None, right=None)\n",
    "ax.set_ylim(bottom=None, top=None)\n",
    "ax.tick_params(axis='both', which='major', labelsize=12)\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "LvamFp1BzZk2"
   },
   "source": [
    "### **Exercise 2.04: Fit a simple linear regression model using the Statsmodels formula API**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 442
    },
    "colab_type": "code",
    "id": "eG0rTO_FrkqI",
    "outputId": "311c1272-3fad-4020-f45e-9d55a737533e"
   },
   "outputs": [],
   "source": [
    "# Fit an ordinary least squares model of crime rate on median home value (training data only)\n",
    "\n",
    "simpleFormula = 'crimeRatePerCapita ~ medianValue_Ks'\n",
    "linearModel = smf.ols(formula=simpleFormula, data=train_data)\n",
    "linearModelResult = linearModel.fit()\n",
    "print(linearModelResult.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "MnnGgmBh0G2I"
   },
   "source": [
    "### **Activity 2.01: Fit a log-linear model using the Statsmodels formula API**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 442
    },
    "colab_type": "code",
    "id": "PFG3qNdQzRRR",
    "outputId": "fb499a61-9a16-40be-e97c-d4f46a3b13a8"
   },
   "outputs": [],
   "source": [
    "# Fit an OLS model whose response is the natural log of crime rate (training data only)\n",
    "\n",
    "logLinearFormula = 'np.log(crimeRatePerCapita) ~ medianValue_Ks'\n",
    "logLinearModel = smf.ols(formula=logLinearFormula, data=train_data)\n",
    "logLinearModResult = logLinearModel.fit()\n",
    "print(logLinearModResult.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "bySbtbZi2ApH"
   },
   "source": [
    "### **Exercise 2.05: Fit a multiple linear regression model using the Statsmodels formula API**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 493
    },
    "colab_type": "code",
    "id": "cEuWdnJ0zRgM",
    "outputId": "536d1a68-f07a-40eb-c965-effacecf9e1e"
   },
   "outputs": [],
   "source": [
    "# Fit an OLS model of crime rate on four predictors (training data only)\n",
    "\n",
    "# Adjacent literals are joined by Python into a single formula string.\n",
    "multiLinearFormula = (\n",
    "    'crimeRatePerCapita ~ pctLowerStatus + radialHighwaysAccess +'\n",
    "    'medianValue_Ks + nitrixOxide_pp10m'\n",
    ")\n",
    "multiLinearModel = smf.ols(formula=multiLinearFormula, data=train_data)\n",
    "multiLinearModResult = multiLinearModel.fit()\n",
    "print(multiLinearModResult.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "2bjo67Vj2kyj"
   },
   "source": [
    "### **Activity 2.02: Fit a multiple log-linear regression model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 629
    },
    "colab_type": "code",
    "id": "2P3Ta9Atp5eU",
    "outputId": "de0bc877-2a98-4eb0-ee8d-76c51c370087"
   },
   "outputs": [],
   "source": [
    "# Fit an OLS model of log crime rate on four predictors, with the predictor sum raised\n",
    "# to the power 2 in the formula (training data only)\n",
    "\n",
    "# Adjacent literals are joined by Python into a single formula string.\n",
    "multiLogLinFormula = (\n",
    "    'np.log(crimeRatePerCapita) ~ '\n",
    "    '(pctLowerStatus + radialHighwaysAccess + medianValue_Ks + nitrixOxide_pp10m)**2'\n",
    ")\n",
    "multiLogLinMod = smf.ols(formula=multiLogLinFormula, data=train_data)\n",
    "multiLogLinModResult = multiLogLinMod.fit()\n",
    "print(multiLogLinModResult.summary())"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Chapter two - Regression.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}