mlessentials/Lab05/Activity5.01/Activity5_1.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "i0_6mTjFH-_v"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.cluster import KMeans\n",
    "import altair as alt\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "pd3ELQrbIUvQ"
   },
   "outputs": [],
   "source": [
    "file_url = '../DataSet/german.data-numeric'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 71
    },
    "colab_type": "code",
    "id": "652VWTRkIZOc",
    "outputId": "9fae4df0-e2b3-4f8b-8c98-bf541d1dbf90"
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv(file_url, header=None, sep='\\s\\s+', prefix='X')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "FKd0dGxfIat_",
    "outputId": "a34b7d62-6ab2-4c9a-e194-e185ad0b4ab3"
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "QQs2uYP2Jpe6"
   },
   "outputs": [],
   "source": [
    "X = df[['X3', 'X9']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "A3JCcQwtK2M_"
   },
   "outputs": [],
   "source": [
    "standard_scaler = StandardScaler()\n",
    "X_scaled = standard_scaler.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "FAxew7byK8_k"
   },
   "outputs": [],
   "source": [
    "clusters = pd.DataFrame()\n",
    "inertia = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "YxB1jOrUK_Ky"
   },
   "outputs": [],
   "source": [
    "clusters['cluster_range'] = range(1, 15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Wj9qG7XWLAis"
   },
   "outputs": [],
   "source": [
    "for k in clusters['cluster_range']:\n",
    "    kmeans = KMeans(n_clusters=k, random_state=8).fit(X_scaled)\n",
    "    inertia.append(kmeans.inertia_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "n8a_y67oLCEb"
   },
   "outputs": [],
   "source": [
    "clusters['inertia'] = inertia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 483
    },
    "colab_type": "code",
    "id": "ChBnw2jMLDq8",
    "outputId": "42868bc9-afee-4fcc-b2ed-352329e12880"
   },
   "outputs": [],
   "source": [
    "clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 368
    },
    "colab_type": "code",
    "id": "xm0azWh-LEwX",
    "outputId": "678485f8-dea9-4528-dcf8-2d6b4733611b"
   },
   "outputs": [],
   "source": [
    "alt.Chart(clusters).mark_line().encode(alt.X('cluster_range'), alt.Y('inertia'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "nJF3LIAFLctS"
   },
   "outputs": [],
   "source": [
    "clusters_number = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 68
    },
    "colab_type": "code",
    "id": "gZZnYiSyL60J",
    "outputId": "ed057701-8ac7-47bf-952c-9434feb2520e"
   },
   "outputs": [],
   "source": [
    "kmeans = KMeans(random_state=1, n_clusters=clusters_number, init='k-means++', n_init=50, max_iter=1000)\n",
    "kmeans.fit(X_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "klTbcYaQL8F6"
   },
   "outputs": [],
   "source": [
    "df['cluster'] = kmeans.predict(X_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 368
    },
    "colab_type": "code",
    "id": "IKt44R6CL9cY",
    "outputId": "a5570db0-7565-462f-d78c-26f554fc87ca"
   },
   "outputs": [],
   "source": [
    "scatter_plot = alt.Chart(df).mark_circle()\n",
    "scatter_plot.encode(x='X3', y='X9',color='cluster:N')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "3_Np9pJiL-6Z"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Activity5_1.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}