mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 08:31:59 +00:00
231 lines
5.2 KiB
Plaintext
231 lines
5.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "GalWZL42gUJL"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"%matplotlib inline\n",
|
|
"import matplotlib as mpl\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import statsmodels.formula.api as smf\n",
|
|
"import statsmodels.graphics.api as smg\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import patsy\n",
|
|
"from statsmodels.graphics.correlation import plot_corr\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"plt.style.use('seaborn')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "6_lKgFqPgeyk"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"rawBostonData = pd.read_csv('../Dataset/Boston.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "yB1hG1WKglOM",
|
|
"outputId": "2030133d-d616-4059-f647-da075e03ed12"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"rawBostonData.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "1krtNoLygsJx"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"rawBostonData = rawBostonData.dropna()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Eyxd8QXwgw_d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"rawBostonData = rawBostonData.drop_duplicates()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 238
|
|
},
|
|
"colab_type": "code",
|
|
"id": "1Z62CWO_gzc5",
|
|
"outputId": "8868804a-e4f7-41fa-c07f-6b759bd63eb5"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"list(rawBostonData.columns)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 241
|
|
},
|
|
"colab_type": "code",
|
|
"id": "EB7FBISzg2yx",
|
|
"outputId": "92c8bde0-5493-442a-ee48-8eefdc48aaba"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"renamedBostonData = rawBostonData.rename(columns = {'CRIM':'crimeRatePerCapita',\n",
|
|
" ' ZN ':'landOver25K_sqft',\n",
|
|
" 'INDUS ':'non-retailLandProptn',\n",
|
|
" 'CHAS':'riverDummy',\n",
|
|
" 'NOX':'nitrixOxide_pp10m',\n",
|
|
" 'RM':'AvgNo.RoomsPerDwelling',\n",
|
|
" 'AGE':'ProptnOwnerOccupied',\n",
|
|
" 'DIS':'weightedDist',\n",
|
|
" 'RAD':'radialHighwaysAccess',\n",
|
|
" 'TAX':'propTaxRate_per10K',\n",
|
|
" 'PTRATIO':'pupilTeacherRatio',\n",
|
|
" 'LSTAT':'pctLowerStatus',\n",
|
|
" 'MEDV':'medianValue_Ks'})\n",
|
|
"renamedBostonData.head()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 323
|
|
},
|
|
"colab_type": "code",
|
|
"id": "N_cmLg_5g56i",
|
|
"outputId": "f11f011c-9d31-4ff5-a03b-e380356fc2a3"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"renamedBostonData.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 452
|
|
},
|
|
"colab_type": "code",
|
|
"id": "X8khkdpjgv-K",
|
|
"outputId": "478bc4de-75c2-413b-dc1c-7156d8bd7c82"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"renamedBostonData.describe(include=[np.number]).T"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "oO_etrVcg-gV"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = renamedBostonData.drop('crimeRatePerCapita', axis = 1)\n",
|
|
"y = renamedBostonData[['crimeRatePerCapita']]\n",
|
|
"seed = 10 \n",
|
|
"test_data_size = 0.3 \n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_data_size, random_state = seed)\n",
|
|
"train_data = pd.concat([X_train, y_train], axis = 1)\n",
|
|
"test_data = pd.concat([X_test, y_test], axis = 1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 793
|
|
},
|
|
"colab_type": "code",
|
|
"id": "dwCbKxJqhBee",
|
|
"outputId": "d1604316-dc31-4a5f-af82-75cfc8b70faf"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"corrMatrix = train_data.corr(method = 'pearson')\n",
|
|
"xnames=list(train_data.columns)\n",
|
|
"ynames=list(train_data.columns)\n",
|
|
"plot_corr(corrMatrix, xnames=xnames, ynames=ynames,\\\n",
|
|
" title=None, normcolor=False, cmap='RdYlBu_r')"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Exercise2.01.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|