{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# K-means clustering of business income and expenses\n",
    "\n",
    "Reads three columns from `taxstats2015.csv`, fits a k-means model on the two average business income/expense columns, then summarises and plots the resulting clusters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "zVIPgXoWlEXC"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.cluster import KMeans\n",
    "import altair as alt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "EhSFCirNlG5J"
   },
   "outputs": [],
   "source": [
    "# Path to the input CSV, relative to the notebook's working directory.\n",
    "file_url = '../DataSet/taxstats2015.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "T9mNYH_WlLIG"
   },
   "outputs": [],
   "source": [
    "# Load only the three columns this notebook uses.\n",
    "df = pd.read_csv(\n",
    "    file_url,\n",
    "    usecols=['Postcode', 'Average total business income', 'Average total business expenses'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 359
    },
    "colab_type": "code",
    "id": "yLUsDBmRlMot",
    "outputId": "f1c45d02-9230-4a93-d165-56a7f4f43424"
   },
   "outputs": [],
   "source": [
    "# Preview the last 10 rows of the loaded data.\n",
    "df.tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "n_Sfij90lOVg"
   },
   "outputs": [],
   "source": [
    "# Feature matrix for clustering: the two numeric columns, without 'Postcode'.\n",
    "X = df[['Average total business income', 'Average total business expenses']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 68
    },
    "colab_type": "code",
    "id": "w6ZLzCgGlbsG",
    "outputId": "ee95a798-8acf-4cd8-c44e-ec4576123b8d"
   },
   "outputs": [],
   "source": [
    "# n_clusters=8 is scikit-learn's default, written out so the cluster count is visible.\n",
    "# n_init=10 pins the number of restarts; the library default changed from 10 to 'auto'\n",
    "# in scikit-learn 1.4, so leaving it implicit makes the clusters version-dependent.\n",
    "# random_state fixes the centroid initialisation so reruns give the same labels.\n",
    "kmeans = KMeans(n_clusters=8, n_init=10, random_state=8)\n",
    "kmeans.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "VX56G-A0ldDY",
    "outputId": "222b701c-4d67-4c78-dede-c160dc426171"
   },
   "outputs": [],
   "source": [
    "# Cluster label for every row of X; display the labels of the last 10 rows.\n",
    "y_preds = kmeans.predict(X)\n",
    "y_preds[-10:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 359
    },
    "colab_type": "code",
    "id": "aPL8x8zalpvO",
    "outputId": "4708c18a-4619-4c2d-d13f-35857847408f"
   },
   "outputs": [],
   "source": [
    "# Attach the labels as a new column so they can be aggregated and plotted.\n",
    "df['cluster'] = y_preds\n",
    "df.tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "GrYs7MjIl1Iu",
    "outputId": "6ef6192e-74d5-46b9-9abb-87a788a6ad01"
   },
   "outputs": [],
   "source": [
    "# Mean of each feature per cluster.\n",
    "# The string 'mean' is used instead of np.mean: recent pandas versions warn when a\n",
    "# NumPy callable is passed as the aggregation function; the result is the same.\n",
    "df.pivot_table(\n",
    "    values=['Average total business income', 'Average total business expenses'],\n",
    "    index='cluster',\n",
    "    aggfunc='mean',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "TnY8IrMlmBWE"
   },
   "outputs": [],
   "source": [
    "# Base chart: circle marks drawn from df; encodings are added in the next cell.\n",
    "scatter_plot = alt.Chart(df).mark_circle()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 368
    },
    "colab_type": "code",
    "id": "hp4SnbDAmMvE",
    "outputId": "07dfcef2-dd0b-468c-ca6c-1c0302ff9f14"
   },
   "outputs": [],
   "source": [
    "# ':N' marks cluster as a nominal (categorical) field, so labels are coloured as\n",
    "# discrete categories rather than on a continuous scale.\n",
    "# .interactive() enables pan and zoom on the rendered chart.\n",
    "scatter_plot.encode(\n",
    "    x='Average total business income',\n",
    "    y='Average total business expenses',\n",
    "    color='cluster:N',\n",
    "    tooltip=['Postcode', 'cluster', 'Average total business income', 'Average total business expenses'],\n",
    ").interactive()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Exercise5_2.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}