mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 08:31:59 +00:00
628 lines
12 KiB
Plaintext
628 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "2fCtd4kCt__z"
|
|
},
|
|
"source": [
|
|
"# Lasso Regression"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "jG6YpAbot__3"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.linear_model import LinearRegression, Lasso\n",
|
|
"from sklearn.metrics import mean_squared_error\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "sGKCHZb0t__8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"_df = pd.read_csv('../Dataset/ccpp.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 230
|
|
},
|
|
"colab_type": "code",
|
|
"id": "MJzrSgbzuAAB",
|
|
"outputId": "efbd59f2-3b49-44f6-b186-e1d275f13cc6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"_df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "WVkCs_VpuAAH"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = _df.drop(['PE'], axis=1).values"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "J4PHvpCzuAAM"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"y = _df['PE'].values"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "tFOurIZ-uAAW"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_X, eval_X, train_y, eval_y = train_test_split(X, y, train_size=0.8, random_state=0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "qChyCx_QuAAu"
|
|
},
|
|
"source": [
|
|
"# Implement a LinearRegression model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "mp6CUsPDuAA0"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_1 = LinearRegression()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "f9w8TJiXuABY",
|
|
"outputId": "f38d02e0-36e7-4aa5-dc4a-fb3adf505ee0"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_1.fit(train_X, train_y)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "MEGafhCSuABo"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_1_preds = lr_model_1.predict(eval_X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "WV0MFrBzuAB5",
|
|
"outputId": "58a1231f-3e1c-452c-e799-307254d3e149"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lr_model_1 R2 Score: {}'.format(lr_model_1.score(eval_X, eval_y)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "oKUggTVmuACA",
|
|
"outputId": "23656741-ee91-4b1c-a3f8-cfb93650bc34"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lr_model_1 MSE: {}'.format(mean_squared_error(eval_y, lr_model_1_preds)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "jqtZ38VhuACE"
|
|
},
|
|
"source": [
|
|
"# Engineer cubic features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "jc3PXN3buACG"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"steps = [\n",
|
|
" ('scaler', MinMaxScaler()),\n",
|
|
" ('poly', PolynomialFeatures(degree=3)),\n",
|
|
" ('lr', LinearRegression())\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "SNCYPewxuACJ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_2 = Pipeline(steps)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 177
|
|
},
|
|
"colab_type": "code",
|
|
"id": "0vr1LXSsuACM",
|
|
"outputId": "27888045-5e27-4291-bc87-b96d761a4c20"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_2.fit(train_X, train_y)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "0T__GpZEuACP",
|
|
"outputId": "cbddcba9-37cd-4115-8675-2ce482e625ae"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lr_model_2 R2 Score: {}'.format(lr_model_2.score(eval_X, eval_y)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "hkcTjTXQuACT"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_2_preds = lr_model_2.predict(eval_X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "ijQLoniluACX",
|
|
"outputId": "7616aaf8-c9d3-4497-8810-ea16985493d8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lr_model_2 MSE: {}'.format(mean_squared_error(eval_y, lr_model_2_preds)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 177
|
|
},
|
|
"colab_type": "code",
|
|
"id": "2oCuH-VOuACa",
|
|
"outputId": "948c8f4a-0909-4e74-f60f-40050f071014"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(lr_model_2[-1].coef_)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "4pBIh88SuACf",
|
|
"outputId": "42d39731-1dbc-4385-8c3c-a83cb935e369"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(len(lr_model_2[-1].coef_))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "sRnkygHkuACn"
|
|
},
|
|
"source": [
|
|
"# Engineer degree-10 polynomial features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Fgc0hLuiuACn"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"steps = [\n",
|
|
" ('scaler', MinMaxScaler()),\n",
|
|
" ('poly', PolynomialFeatures(degree=10)),\n",
|
|
" ('lr', LinearRegression())\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ENi3637QuACr"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_3 = Pipeline(steps)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 0
|
|
},
|
|
"colab_type": "code",
|
|
"id": "6Fa3hJ4duACx",
|
|
"outputId": "9faa5eb7-1ed0-40b0-b99d-c0ba085b4d5f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_3.fit(train_X, train_y)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 0
|
|
},
|
|
"colab_type": "code",
|
|
"id": "dvVQIDtNuAC1",
|
|
"outputId": "3448cfd7-6ad9-4b58-ccb8-3cfb0e04aea7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lr_model_3 R2 Score: {}'.format(lr_model_3.score(eval_X, eval_y)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "g_J5KhfCuAC7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lr_model_3_preds = lr_model_3.predict(eval_X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 0
|
|
},
|
|
"colab_type": "code",
|
|
"id": "phBeRc0FuADC",
|
|
"outputId": "4de4e58b-e326-4639-d0c0-08a40456ee04"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lr_model_3 MSE: {}'.format(mean_squared_error(eval_y, lr_model_3_preds)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 0
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Bx8QmaJduADL",
|
|
"outputId": "1832ef7c-f3a3-42f7-aba9-d4a4d640135b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(len(lr_model_3[-1].coef_))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 0
|
|
},
|
|
"colab_type": "code",
|
|
"id": "omV4-ydJuADT",
|
|
"outputId": "4ffe3e60-4949-4a96-8ad8-042bb5551ea0"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(lr_model_3[-1].coef_[:35])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "R3KTPZd5uADY"
|
|
},
|
|
"source": [
|
|
"# Implement Lasso on the same degree-10 polynomial pipeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "wvtRmkOLuADc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"steps = [\n",
|
|
" ('scaler', MinMaxScaler()),\n",
|
|
" ('poly', PolynomialFeatures(degree=10)),\n",
|
|
" ('lr', Lasso(alpha=0.01))\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "VKkWpShFuADi"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lasso_model = Pipeline(steps)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 212
|
|
},
|
|
"colab_type": "code",
|
|
"id": "CFUKWqGSuAD9",
|
|
"outputId": "304b16fe-a245-4278-8219-4d0f466e21ea"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lasso_model.fit(train_X, train_y)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "_bOVRGGeuAEh",
|
|
"outputId": "6ff11eaa-45ec-4dab-b807-dbdc27d57093"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lasso_model R2 Score: {}'.format(lasso_model.score(eval_X, eval_y)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "B0sCTyzxuAE1"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"lasso_preds = lasso_model.predict(eval_X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "terkLrI6uAE7",
|
|
"outputId": "5804a278-f495-4a16-a703-2621378d370d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('lasso_model MSE: {}'.format(mean_squared_error(eval_y, lasso_preds)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "2y24Ljr_uAFG",
|
|
"outputId": "759e6c34-cee9-4f1e-9791-408aad4e4b20"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(len(lasso_model[-1].coef_))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 141
|
|
},
|
|
"colab_type": "code",
|
|
"id": "9fPTFs6CuAFV",
|
|
"outputId": "d15394eb-8ff7-4f36-af46-b61e12f49b97"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(lasso_model[-1].coef_[:35])"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Exercise7.09.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|