mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 16:41:05 +00:00
468 lines
12 KiB
Plaintext
468 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "o_xWRbP9oeJo"
|
|
},
|
|
"source": [
|
|
"### **Exercise 2.01: Loading and preparing the data for analysis**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "S_e0RsA-fBKd"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Import necessary modules\n",
|
|
"\n",
|
|
"%matplotlib inline\n",
|
|
"import matplotlib as mpl\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import statsmodels.formula.api as smf\n",
|
|
"import statsmodels.graphics.api as smg\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import patsy\n",
|
|
"from statsmodels.graphics.correlation import plot_corr\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"plt.style.use('seaborn')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load the dataset into a pandas dataframe\n",
|
|
"\n",
|
|
"rawBostonData = pd.read_csv('../Dataset/Boston.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "YEO5woBrpYci",
|
|
"outputId": "8eb44eb8-87a6-432a-cf97-a01e0727d68a"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Inspect the dataframe\n",
|
|
"\n",
|
|
"rawBostonData.head() "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Zz23WUNNpx6c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Drop missing values from the dataframe\n",
|
|
"\n",
|
|
"rawBostonData = rawBostonData.dropna()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "gDCUyRscp43W"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Drop duplicate records from the dataframe\n",
|
|
"\n",
|
|
"rawBostonData = rawBostonData.drop_duplicates()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 238
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Q9GyOY_9p5Pt",
|
|
"outputId": "bcae4407-2c54-4027-9fff-48de3b803c3d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# List the dataframe column names\n",
|
|
"\n",
|
|
"list(rawBostonData.columns)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "BFydSue8qKaZ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Rename the dataframe column headings\n",
|
|
"\n",
|
|
"renamedBostonData = rawBostonData.rename(columns = {'CRIM':'crimeRatePerCapita',\n",
|
|
" ' ZN ':'landOver25K_sqft',\n",
|
|
" 'INDUS ':'non-retailLandProptn',\n",
|
|
" 'CHAS':'riverDummy',\n",
|
|
" 'NOX':'nitrixOxide_pp10m',\n",
|
|
" 'RM':'AvgNo.RoomsPerDwelling',\n",
|
|
" 'AGE':'ProptnOwnerOccupied',\n",
|
|
" 'DIS':'weightedDist',\n",
|
|
" 'RAD':'radialHighwaysAccess',\n",
|
|
" 'TAX':'propTaxRate_per10K',\n",
|
|
" 'PTRATIO':'pupilTeacherRatio',\n",
|
|
" 'LSTAT':'pctLowerStatus',\n",
|
|
" 'MEDV':'medianValue_Ks'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 323
|
|
},
|
|
"colab_type": "code",
|
|
"id": "792LNU-3qKn9",
|
|
"outputId": "63f9cca7-24ed-42dd-ab07-d43a29499393"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Inspect the types of data in the dataframe\n",
|
|
"\n",
|
|
"renamedBostonData.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 452
|
|
},
|
|
"colab_type": "code",
|
|
"id": "ZEyhe_JeqKyg",
|
|
"outputId": "9dda9806-e319-45e5-a3e5-06df3ed90550"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Calculate basic statistics using the values in the dataframe\n",
|
|
"\n",
|
|
"renamedBostonData.describe(include=[np.number]).T"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "n5wmtur8rXFG"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Split the data set into training and test sets\n",
|
|
"\n",
|
|
"X = renamedBostonData.drop('crimeRatePerCapita', axis = 1)\n",
|
|
"y = renamedBostonData[['crimeRatePerCapita']]\n",
|
|
"seed = 10 \n",
|
|
"test_data_size = 0.3 \n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_data_size, random_state = seed)\n",
|
|
"train_data = pd.concat([X_train, y_train], axis = 1)\n",
|
|
"test_data = pd.concat([X_test, y_test], axis = 1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 793
|
|
},
|
|
"colab_type": "code",
|
|
"id": "G8w04aBtrXWQ",
|
|
"outputId": "a0751f02-6f76-43e6-e0e1-1c522255c60f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create and plot a correlation matrix\n",
|
|
"\n",
|
|
"corrMatrix = train_data.corr(method = 'pearson')\n",
|
|
"xnames=list(train_data.columns)\n",
|
|
"ynames=list(train_data.columns)\n",
|
|
"plot_corr(corrMatrix, xnames=xnames, ynames=ynames,\\\n",
|
|
" title=None, normcolor=False, cmap='RdYlBu_r')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "aWfnyCJhr7SP"
|
|
},
|
|
"source": [
|
|
"### **Exercise 2.02: Graphical investigation of linear relationships using Python**\n",
|
|
"\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 441
|
|
},
|
|
"colab_type": "code",
|
|
"id": "pZMrvpvirkMa",
|
|
"outputId": "4b3a1921-82cf-449a-c57a-6fee9d804b20"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the seaborn function regplot to create a scatter plot and fit a regression line through it\n",
|
|
"\n",
|
|
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
|
"sns.regplot(x='medianValue_Ks', y='crimeRatePerCapita', ci=None,\n",
|
|
"data=train_data, ax=ax, color='k', scatter_kws={\"s\": 20,\"color\":\\\n",
|
|
"\"royalblue\", \"alpha\":1})\n",
|
|
"ax.set_ylabel('Crime rate per Capita', fontsize=15, fontname='DejaVu Sans')\n",
|
|
"ax.set_xlabel(\"Median value of owner-occupied homes in $1000's\",\\\n",
|
|
"fontsize=15, fontname='DejaVu Sans')\n",
|
|
"ax.set_xlim(left=None, right=None)\n",
|
|
"ax.set_ylim(bottom=None, top=30)\n",
|
|
"ax.tick_params(axis='both', which='major', labelsize=12)\n",
|
|
"fig.tight_layout()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "Ncyc8Kz9uvC6"
|
|
},
|
|
"source": [
|
|
"### **Exercise 2.03: Examining a possible log-linear relationship using Python**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 441
|
|
},
|
|
"colab_type": "code",
|
|
"id": "P1FzdmrsrkcU",
|
|
"outputId": "15c55406-d4d6-44e0-ee17-54213bc6ad9c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the seaborn function regplot to create a log-linear plot and fit a regression line through it\n",
|
|
"\n",
|
|
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
|
"y = np.log(train_data['crimeRatePerCapita'])\n",
|
|
"sns.regplot(x='medianValue_Ks', y=y, ci=95, data=train_data, ax=ax,\\\n",
|
|
"color='k', scatter_kws={\"s\": 20,\"color\": \"royalblue\", \"alpha\":1})\n",
|
|
"ax.set_ylabel('log of Crime rate per Capita', fontsize=15,\\\n",
|
|
"fontname='DejaVu Sans')\n",
|
|
"ax.set_xlabel(\"Median value of owner-occupied homes in $1000's\",\\\n",
|
|
"fontsize=15, fontname='DejaVu Sans')\n",
|
|
"ax.set_xlim(left=None, right=None)\n",
|
|
"ax.set_ylim(bottom=None, top=None)\n",
|
|
"ax.tick_params(axis='both', which='major', labelsize=12)\n",
|
|
"fig.tight_layout()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "LvamFp1BzZk2"
|
|
},
|
|
"source": [
|
|
"### **Exercise 2.04: Fit a simple linear regression model using the Statsmodels formula API**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 442
|
|
},
|
|
"colab_type": "code",
|
|
"id": "eG0rTO_FrkqI",
|
|
"outputId": "311c1272-3fad-4020-f45e-9d55a737533e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the statsmodels API to create a simple linear regression\n",
|
|
"\n",
|
|
"linearModel = smf.ols(formula='crimeRatePerCapita ~ medianValue_Ks',\\\n",
|
|
"data=train_data)\n",
|
|
"linearModelResult = linearModel.fit()\n",
|
|
"print(linearModelResult.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "MnnGgmBh0G2I"
|
|
},
|
|
"source": [
|
|
"### **Activity 2.01: Fit a log-linear model using the Statsmodels formula API**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 442
|
|
},
|
|
"colab_type": "code",
|
|
"id": "PFG3qNdQzRRR",
|
|
"outputId": "fb499a61-9a16-40be-e97c-d4f46a3b13a8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the statsmodels API to create a log-linear regression model\n",
|
|
"\n",
|
|
"logLinearModel = smf.ols(formula='np.log(crimeRatePerCapita) ~ medianValue_Ks',\\\n",
|
|
"data=train_data)\n",
|
|
"logLinearModResult = logLinearModel.fit()\n",
|
|
"print(logLinearModResult.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "bySbtbZi2ApH"
|
|
},
|
|
"source": [
|
|
"### **Exercise 2.05: Fit a multiple linear regression model using the Statsmodels formula API**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 493
|
|
},
|
|
"colab_type": "code",
|
|
"id": "cEuWdnJ0zRgM",
|
|
"outputId": "536d1a68-f07a-40eb-c965-effacecf9e1e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the statsmodels API to create a multiple linear regression model\n",
|
|
"\n",
|
|
"multiLinearModel = smf.ols(formula=\\\n",
|
|
"'crimeRatePerCapita ~ pctLowerStatus + radialHighwaysAccess +\\\n",
|
|
"medianValue_Ks + nitrixOxide_pp10m', data=train_data)\n",
|
|
"multiLinearModResult = multiLinearModel.fit()\n",
|
|
"print(multiLinearModResult.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "2bjo67Vj2kyj"
|
|
},
|
|
"source": [
|
|
"### **Activity 2.02: Fit a multiple log-linear regression model**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 629
|
|
},
|
|
"colab_type": "code",
|
|
"id": "2P3Ta9Atp5eU",
|
|
"outputId": "de0bc877-2a98-4eb0-ee8d-76c51c370087"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the statsmodels API to create a multiple log-linear regression model\n",
|
|
"\n",
|
|
"multiLogLinMod = smf.ols(formula=\\\n",
|
|
"'np.log(crimeRatePerCapita) ~ \\\n",
|
|
"(pctLowerStatus + radialHighwaysAccess + medianValue_Ks + nitrixOxide_pp10m)**2',\\\n",
|
|
"data=train_data)\n",
|
|
"multiLogLinModResult = multiLogLinMod.fit()\n",
|
|
"print(multiLogLinModResult.summary())"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Chapter two - Regression.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|