Files
mlessentials/RegressionDemo.ipynb
2022-07-25 11:07:52 -07:00

1 line
19 KiB
Plaintext

{"metadata":{"orig_nbformat":4,"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.10"}},"nbformat_minor":5,"nbformat":4,"cells":[{"cell_type":"code","source":"#Import your Libraries\nimport numpy as np\nimport pandas as pd\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom sklearn.linear_model import LinearRegression\nimport sklearn.metrics as metrics\n%matplotlib inline","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"2dac3d66-b2d4-49df-8540-82d05eeff307"},{"cell_type":"code","source":"# %%timeit -n 1\n# Load your data -- start with data.csv... then Life Expectancy - and then anyone you choose\n# Replace with your dataset... for instance - if it is on github -use: https://raw.githubusercontent.com/fenago/introml/main/Life%20Expectancy%20Data.csv\ndf = pd.read_csv('data.csv')","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"c21f8a1a-a9d5-4277-868f-bef4e51785a9"},{"cell_type":"code","source":"len(df)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"823b6b3a-c279-4b3e-9b75-75ab01f8ed0b"},{"cell_type":"code","source":"df.describe()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"dfdc07fd-82d6-411c-bf4e-4012315538e1"},{"cell_type":"code","source":"df.shape","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"4f5bc2f4-a159-4a24-b02d-cf234b7daefa"},{"cell_type":"code","source":"df.nunique()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"960afc07-cb45-4851-aa9b-6a564d943273"},{"cell_type":"code","source":"df.corr()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"17fa6add-bdac-4d33-8b96-e102f3d13bc5"},{"cell_type":"code","source":"# Basic Data Cleaning\ndf.columns = 
df.columns.str.lower().str.replace(' ', '_') # A\n \nstring_columns = list(df.dtypes[df.dtypes == 'object'].index) # B\n \nfor col in string_columns:\n df[col] = df[col].str.lower().str.replace(' ', '_') # C","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"03c95958-f524-486f-84dd-62eccffe2985"},{"cell_type":"code","source":"df.head()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"5694febc-4570-4e82-a40a-4e347e468eb6"},{"cell_type":"code","source":"df.head().T","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"cfd18cea-dbe5-4ac0-864c-922704a8f46a"},{"cell_type":"code","source":"# Replace with your target variable --- df.YOUR_TARGET_VARIABLE \n# Also replace your X label\nplt.figure(figsize=(6, 4))\n\nsns.histplot(df.life_expectancy_, bins=40, color='black', alpha=1)\nplt.ylabel('Frequency')\nplt.xlabel('life_expectancy_')\nplt.title('Distribution of prices')\n\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"227d8604-3160-4070-a1b8-b7c2e71ce771"},{"cell_type":"code","source":"# This may not be needed for your dataset but explore with different features\nplt.figure(figsize=(6, 4))\n\nsns.histplot(df.msrp[df.msrp < 100000], bins=40, color='black', alpha=1)\nplt.ylabel('Frequency')\nplt.xlabel('Price')\nplt.title('Distribution of prices')\n\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"bfc72552-0bc0-4da4-b4d9-b43d4ca28bbf"},{"cell_type":"code","source":"# This may not be needed for your dataset but explore with different features\n\nlog_price = np.log1p(df.msrp)\n\nplt.figure(figsize=(6, 4))\n\nsns.histplot(log_price, bins=40, color='black', alpha=1)\nplt.ylabel('Frequency')\nplt.xlabel('Log(Price + 1)')\nplt.title('Distribution of prices after log tranformation')\n\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"dfe5a4c0-af19-44f8-a155-a6f36ca2ea2d"},{"cell_type":"code","source":"# Check for nulls 
--- you do NOT want nulls when you train\ndf.isnull().sum()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"4ad70d6b-e458-4bb7-be22-f550a07aed9f"},{"cell_type":"code","source":"df.head().T","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"a8a3347d-5141-4b92-bb07-3dc2dbc1bf3a"},{"cell_type":"code","source":"#delete columns --- this may or may NOT be needed. As before - skip if you don't need it\n# You will encounter times where you will want to delete columns. This is how you do that.\ndf = df.drop(['x5_latitude', 'x6_longitude', 'x1_transaction_date'], axis=1)\ndf","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"2871e2ab-8677-43b4-bba1-36fe991438ab"},{"cell_type":"code","source":"'''\n# Split the data into test, train, validation sets... 60/20/20\nfrom sklearn.model_selection import train_test_split\n# This gives the 80/20 train test split\ndf_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)\n# This splits df_train_full again so it is 60/20/20\ndf_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)\nlen(df_train), len(df_val), len(df_test)\n# Replace nulls with 0's - these are pandas dataframes\ndf_train = df_train.fillna(0)\ndf_val = df_val.fillna(0)\ndf_test = df_test.fillna(0)\nlen(df_train),len(df_val),len(df_test)\n'''","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"fe1b5783-3f32-4440-81b7-1ae401f4ef09"},{"cell_type":"code","source":"# Split the data into test, train, validation sets... 
80/20\nfrom sklearn.model_selection import train_test_split\n# This gives the 80/20 train test split\ndf_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)\n\nlen(df_train_full), len(df_test)\n# Replace nulls with 0's - these are pandas dataframes\ndf_train_full = df_train_full.fillna(0)\n\ndf_test = df_test.fillna(0)\nlen(df_train_full),len(df_test)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"79babb7b-c583-4831-b555-54e68361b295"},{"cell_type":"code","source":"#Split the y out into train/test/splits... these are numpy ndarrays ... msrp is your target variables\n# Replace with your target variable!!! \ny_train = (df_train_full.life_expectancy_).values\ny_test = (df_test.life_expectancy_).values\ndel df_train_full['life_expectancy_']\ndel df_test['life_expectancy_']\n","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"033173c4-f018-4f63-b21b-f04209d392f7"},{"cell_type":"code","source":"# Convert these data frames into a LIST of DICTIONARIES (each element in the list is a dictionary (the record))\ndict_train = df_train_full.to_dict(orient='records')\ndict_test = df_test.to_dict(orient='records')","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"65eccd7c-38b2-4c32-9408-1c3e4eeb78e9"},{"cell_type":"code","source":"# Convert the LIST OF DICTIONARIES into a Feature Matrix (does all of the encoding)\nfrom sklearn.feature_extraction import DictVectorizer\n \ndv = DictVectorizer(sparse=False)\n \nX_train = dv.fit_transform(dict_train)\nX_test = dv.transform(dict_test)\nfeatures = dv.get_feature_names_out() #Features as they exist in the Vectorized Dictionary (this is an ndarray)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"8430e24e-0e56-4b06-8b2b-465762403e19"},{"cell_type":"code","source":"# %%timeit -n 1\n# if you uncomment %%timeit it will not put lr into memory\n# Fit the model - this will take some time and will burn CPU (not MEMORY)\nlr = 
LinearRegression().fit(X_train, y_train)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"81a922e7-f784-41d4-affe-a65d5efecb9a"},{"cell_type":"code","source":"# List the fitted attributes of a model: scikit-learn stores values learned by fit() under names ending in '_'\ndef get_properties(model): \n return [i for i in model.__dict__ if i.endswith('_')] \n# Inspect the LinearRegression model (lr) fitted in the previous cell\nget_properties(lr)","metadata":{},"execution_count":null,"outputs":[],"id":"78c36129-3437-4444-80f3-665ce054d529"},{"cell_type":"code","source":"type(X_train)\ntype(dv.get_feature_names_out())\ntype(lr.coef_)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"4f6b5c2e-b5ea-41de-9943-da13c9b54e02"},{"cell_type":"code","source":"lr.coef_[0]\nprint('%.3f' % lr.intercept_, '(Intercept)') \n# Evaluate the coefficients to learn what the model thinks is important in the predictions.\nfor i,j in zip(features, lr.coef_): print('%.3f' % j, i)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"a70eb705-485e-4e11-84f6-357949838a1a"},{"cell_type":"code","source":"# %%timeit -n 1\ny_pred = lr.predict(X_test)\ndf_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})\ndf_results\n","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"f8525fbf-020e-4deb-a485-48f1839a2532"},{"cell_type":"code","source":"from sklearn import metrics\nprint('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))\nprint('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))\nprint('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"ad274bb7-2d00-4de7-b3fb-ace1d7f6315f"},{"cell_type":"code","source":"# View the coefficients\nprint(lr.intercept_)\nprint(lr.coef_)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"c3271277-26f0-44b2-992b-1f94e8068de9"},{"cell_type":"code","source":"pred_y = lr.predict(X_test)\nprint(\"The first 10 prediction 
{}\".format(pred_y[:10].round(0)))\nprint(\"The real first 10 labels {}\".format(y_test[:10]))\n\nmse = metrics.mean_squared_error(y_test, pred_y)\nprint(\"Mean Squared Error {}\".format(mse))","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"4a0ca23f-81cd-4f0a-b815-66ab6d56dafd"},{"cell_type":"code","source":"type(df_train_full.head(1))","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"6458b9b4-53f5-4325-8cc8-e316fb20eae0"},{"cell_type":"code","source":"# Use double brackets around the iloc to force it to return a pandas dataframe and not a series\n# Then you can convert any record into a dictionary\ndf_train_full.iloc[[21]]","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"6cfd0fbc-b397-4c86-9be0-4d5b105c7a04"},{"cell_type":"code","source":"# How to convert any pandas row into a dictionary... needed for predictions\ndf_train_full.iloc[[213]].to_dict('records')[0]","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"0b519a52-11f8-4d03-9cb2-fee16a79acbf"},{"cell_type":"code","source":"# How to convert any pandas row into a dictionary... needed for predictions\ndf_train_full.head(21).to_dict('records')[0]","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"78bc6352-6bab-4356-9e6e-07c1a977a06f"},{"cell_type":"code","source":"#car = df_train.head(1).to_dict('records')[0]\nitem = df_train_full.iloc[[213]].to_dict('records')[0]\nactual = y_train[[213]]","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"8f58aa87-c643-4a94-b0ab-c0582ee4e18d"},{"cell_type":"code","source":"# The item to be predicted is passed in. 
\ndef model_prediction(item, dv, model):\n X = dv.transform([item])\n y_pred = model.predict(X)\n return y_pred[0]","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"0b8da488-76d2-4c2e-88b4-59f0390517e4"},{"cell_type":"code","source":"model_prediction(item,dv,lr)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"25ee6832-7b4b-40f1-9f02-46f611d87968"},{"cell_type":"code","source":"actual","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"a2dbd283-4485-44e5-aa81-fdc0c0ebb813"},{"cell_type":"code","source":"lr.get_params()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"940c8c4e-d164-4da0-8287-630f7306b78a"},{"cell_type":"code","source":"#Algorithm Test Harness for Regression Algorithms","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"8762b794-f60b-41f2-9fcc-2079fb9900bc"},{"cell_type":"code","source":"from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"6efdf3d0-409e-4ef5-9bcd-a0dcfe618cce"},{"cell_type":"code","source":"from time import time\n\nfrom sklearn.linear_model import LinearRegression, Ridge,Lasso\nfrom sklearn.neighbors import KNeighborsRegressor\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor\nfrom sklearn.tree import DecisionTreeRegressor","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"0ae79a3c-20b3-43d4-b2d9-d97883682424"},{"cell_type":"code","source":"regressors = [\n KNeighborsRegressor(),\n GradientBoostingRegressor(),\n ExtraTreesRegressor(),\n RandomForestRegressor(),\n DecisionTreeRegressor(),\n LinearRegression(),\n Lasso(),\n Ridge()\n]","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"6d97a1dc-f352-4671-a5bb-e93270548af7"},{"cell_type":"code","source":"head = 10\nfor model in regressors[:head]:\n start = time()\n 
model.fit(X_train, y_train)\n train_time = time() - start\n start = time()\n y_pred = model.predict(X_test)\n predict_time = time()-start \n print(model)\n print(\"\\tTraining time: %0.3fs\" % train_time)\n print(\"\\tPrediction time: %0.3fs\" % predict_time)\n print(\"\\tExplained variance:\", explained_variance_score(y_test, y_pred))\n print(\"\\tMean absolute error:\", mean_absolute_error(y_test, y_pred))\n print(\"\\tR2 score:\", r2_score(y_test, y_pred))\n print()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"b3f9f2de-b0ea-467e-96c1-62e3d2ffd408"},{"cell_type":"markdown","source":"## R2 Score with Regression Models\nThe R2 score is a very important metric that is used to evaluate the performance of a regression-based machine learning model. It is pronounced as R squared and is also known as the coefficient of determination. It measures the proportion of the variance in the target variable that is explained by the model's predictions. \n### Simply put, R2 summarizes how closely the predictions made by the model match the actual values of the samples in the dataset.\nThe R2 scores of the models trained here range from .91 to .98 (notice that LR does not converge, so its score is unknown). \n### If the value of the R squared score is 1, it means that the model's predictions match the data perfectly, and if its value is 0, it means that the model does no better than always predicting the mean of the target, so it will perform badly on an unseen dataset (the score can even be negative for a model that is worse than that baseline). This also implies that the closer the value of the R squared score is to 1, the better the model fits the data.\n# In summary - look for the models with the highest R2 values","metadata":{},"id":"d6cc64a0-1a2e-4483-be98-f9ca486578ea"},{"cell_type":"code","source":"# Assuming gradient boosting (GradientBoostingRegressor) is the best model - let's find the best hyperparameters\n\n# If you want to find the best parameters for a model - do a grid search over several hyperparameter values (or a random search)\n# A. Do model.get_params() to get all of the existing hyperparameters for the model\n# B. Create a dictionary with different hyperparameter options\n# C. 
Run the GridSearch and it will find the best parameters\n# D. Be patient because this will take a long time.","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"b0731910-24ac-442e-b953-de78bae987ed"},{"cell_type":"code","source":"from sklearn.model_selection import GridSearchCV","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"c91e4cf0-9bf4-4392-abbe-0263ae99bae0"},{"cell_type":"code","source":"# Hyperparameter grid for GradientBoostingRegressor, using the option names from scikit-learn >= 1.0:\n# 'absolute_error' replaces 'lad', 'squared_error' replaces 'mse', and None replaces max_features='auto' (use all features).\n# The 'mae' criterion is no longer listed because later scikit-learn releases removed it.\nparameters = { 'loss' : ['squared_error', 'absolute_error', 'huber', 'quantile'],\n 'learning_rate' : (0.05,0.25,0.50,1),\n 'criterion' : ['friedman_mse', 'squared_error'],\n 'max_features' : [None, 'sqrt', 'log2']\n }","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"836508f3-d03b-42b0-be95-ff8fe537e709"},{"cell_type":"code","source":"# This will take a very long time to execute (potentially hours) - skip it\ngrid = GridSearchCV(GradientBoostingRegressor(),parameters)\nmodel = grid.fit(X_train,y_train)\nprint(model.best_params_,'\\n')\nprint(model.best_estimator_,'\\n')","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"06f4e812-8293-4d6a-be91-136693bef9f2"},{"cell_type":"code","source":"{'criterion': 'friedman_mse', 'learning_rate': 0.25, 'loss': 'absolute_error', 'max_features': 'sqrt'} \n\nGradientBoostingRegressor(learning_rate=0.25, loss='absolute_error', max_features='sqrt') ","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"7e177e03-e5d7-4559-97c6-87af1879ff51"},{"cell_type":"code","source":"# Plot the distribution of the prediction errors (y_pred - y_test); histplot replaces the deprecated sns.distplot\nsns.histplot(y_pred-y_test, kde=True, stat='density')","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"ccaf602e-7b4e-4545-aa5a-29df5da78aa0"},{"cell_type":"code","source":"# If you skipped the grid search then this won't run... 
obviously\ndf_1 = pd.DataFrame(grid.cv_results_).set_index('rank_test_score').sort_index()\ndf_1.shape","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"c0314da1-a535-4f94-ba79-03c44b4fdbfe"},{"cell_type":"code","source":"#Hyperparameter Tuning \n# To get the best hyperparameters - call .get_params() on the model. \n# Then copy the parameters that you want to test into a dictionary list as you see below\n# The GridSearchCV will give you the best parameters","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"d27f14e1-6611-4f44-b2c6-92a183798142"},{"cell_type":"code","source":"from sklearn.linear_model import SGDRegressor\nsgd = SGDRegressor().fit(X_train, y_train)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"3a5c6af0-6aa4-4427-b72e-feef97117794"},{"cell_type":"code","source":"sgd.get_params()","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"44b575ad-7e8b-4486-98f3-4c70510aa742"},{"cell_type":"code","source":"#This many parameters will take a very long time to load\nparam = {'alpha': [0.0001,0.001],\n 'average': [False,True],\n 'early_stopping': [False,True],\n 'epsilon': [0.1,.001],\n 'eta0': [0.01,.1],\n 'fit_intercept': [True,False],\n 'l1_ratio': [0.15,0.2,0.1],\n 'learning_rate': ['invscaling','optimal','constant','adaptive'],\n 'loss': ['squared_error','huber','epsilon_insensitive','squared_epsilon_insensitive'],\n 'max_iter': [1000],\n 'n_iter_no_change': [5,7],\n 'penalty': ['l2','l1','elasticnet'],\n 'power_t': [0.25],\n 'random_state': [None],\n 'shuffle': [True],\n 'tol': [0.001],\n 'validation_fraction': [0.1],\n 'verbose': [0],\n 'warm_start': [False]}","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"f7881392-37cd-405b-8f80-3282f763a90e"},{"cell_type":"code","source":"from sklearn.model_selection import RepeatedKFold\nfrom sklearn.model_selection import GridSearchCV\ncv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)\n# define 
search\nsearch = GridSearchCV(sgd, param, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)\n# execute search\nresult = search.fit(X_train, y_train)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"91bd0b8f-8fd4-4e9a-9823-d6f531790d63"},{"cell_type":"code","source":"# summarize result\nprint('Best Score: %s' % result.best_score_)\nprint('Best Hyperparameters: %s' % result.best_params_)","metadata":{"trusted":true},"execution_count":null,"outputs":[],"id":"46caaf03-88e3-4434-9ac2-0ce804f8e020"}]}