{ "cells": [
  { "cell_type": "markdown", "metadata": {}, "source": [
    "# Exercise 5.4: k-means by hand, first assignment step\n",
    "\n",
    "Loads the average business income and expenses per postcode from `taxstats2015.csv`, draws random starting centroids, and assigns the first five rows to their nearest centroid using the squared Euclidean distance."
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "teVYTWp62NMf" }, "outputs": [], "source": [
    "import random\n",
    "\n",
    "import altair as alt\n",
    "import pandas as pd\n",
    "from sklearn.cluster import KMeans"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "UlHRqugF4unX" }, "outputs": [], "source": [
    "# Tunable settings, kept together so they are easy to find.\n",
    "SEED = 42        # fixes the random centroid draw so every run gives the same result\n",
    "N_CLUSTERS = 4   # number of starting centroids\n",
    "\n",
    "random.seed(SEED)"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Load the data"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "b_w9MPzrszPF" }, "outputs": [], "source": [
    "file_url = '../DataSet/taxstats2015.csv'"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "9dP6E2ye3Jia" }, "outputs": [], "source": [
    "# Only the three columns used below are read from the file.\n",
    "df = pd.read_csv(\n",
    "    file_url,\n",
    "    usecols=['Postcode', 'Average total business income', 'Average total business expenses'],\n",
    ")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "6sx1xL3h4Ffz" }, "outputs": [], "source": [
    "# The two columns the distance calculation works on (Postcode is left out).\n",
    "X = df[['Average total business income', 'Average total business expenses']]"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Value ranges\n",
    "\n",
    "The minimum and maximum of each column bound the area in which the random centroids are drawn."
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "k5zU1YmA4W-T" }, "outputs": [], "source": [
    "business_income_min = df['Average total business income'].min()\n",
    "business_income_max = df['Average total business income'].max()\n",
    "\n",
    "business_expenses_min = df['Average total business expenses'].min()\n",
    "business_expenses_max = df['Average total business expenses'].max()"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 85 }, "colab_type": "code", "id": "2IQm7qpU4p_S", "outputId": "7dc1ba5c-8b46-4fde-b910-fac098ab6c8a" }, "outputs": [], "source": [
    "print(f'income:   min={business_income_min}, max={business_income_max}')\n",
    "print(f'expenses: min={business_expenses_min}, max={business_expenses_max}')"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Draw random starting centroids"
  ] },
  { "cell_type": "code",
    "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "eqGlqkPi5Uo2" }, "outputs": [], "source": [
    "centroids = pd.DataFrame()"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "dMCfxYzA5Vv8" }, "outputs": [], "source": [
    "# range() only accepts integers, so cast in case min()/max() come back as floats.\n",
    "# The income column is sampled first; keep this order so the seeded draw stays the same.\n",
    "centroids['Average total business income'] = random.sample(\n",
    "    range(int(business_income_min), int(business_income_max)), N_CLUSTERS\n",
    ")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "CE7GGrvv5wBm" }, "outputs": [], "source": [
    "centroids['Average total business expenses'] = random.sample(\n",
    "    range(int(business_expenses_min), int(business_expenses_max)), N_CLUSTERS\n",
    ")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "colab_type": "code", "id": "zBW4apPX5xb2", "outputId": "f6a9fec5-9075-489d-dd48-df3974f28eb9" }, "outputs": [], "source": [
    "# Label each centroid with its row position (0 .. N_CLUSTERS - 1).\n",
    "centroids['cluster'] = centroids.index\n",
    "centroids"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Plot the data points and the centroids"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "eIA0hzt16xx7" }, "outputs": [], "source": [
    "# Orange layer: the first five rows of the data set.\n",
    "chart1 = alt.Chart(df.head()).mark_circle().encode(\n",
    "    x='Average total business income',\n",
    "    y='Average total business expenses',\n",
    "    color=alt.value('orange'),\n",
    "    tooltip=['Postcode', 'Average total business income', 'Average total business expenses']\n",
    ").interactive()"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "7PGvAyEL63_R" }, "outputs": [], "source": [
    "# Black layer: the randomly drawn centroids, with larger markers.\n",
    "chart2 = alt.Chart(centroids).mark_circle(size=100).encode(\n",
    "    x='Average total business income',\n",
    "    y='Average total business expenses',\n",
    "    color=alt.value('black'),\n",
    "    tooltip=['cluster', 'Average total business income', 'Average total business expenses']\n",
    ").interactive()"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363
}, "colab_type": "code", "id": "Xm4FTXtp65ZB", "outputId": "b824d0a2-1440-42a4-bd21-4d12f9b1454d" }, "outputs": [], "source": [
    "chart1 + chart2"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Assign the first row to its nearest centroid"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "eQKhecFQ66sa" }, "outputs": [], "source": [
    "def squared_euclidean(data_x, data_y, centroid_x, centroid_y):\n",
    "    \"\"\"Return the squared Euclidean distance between a data point and a centroid.\n",
    "\n",
    "    The square root is left out: only the ordering of the distances is needed\n",
    "    to pick the nearest centroid, and squaring does not change that ordering.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_x, data_y : coordinates of the data point.\n",
    "    centroid_x, centroid_y : coordinates of the centroid.\n",
    "    \"\"\"\n",
    "    return (data_x - centroid_x)**2 + (data_y - centroid_y)**2"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "ORuNfqEy7zB5" }, "outputs": [], "source": [
    "# Coordinates of the first row (index label 0).\n",
    "data_x = df.at[0, 'Average total business income']\n",
    "data_y = df.at[0, 'Average total business expenses']"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "id": "yakWRQoN70iU", "outputId": "abf5ef31-afc7-4eee-9b68-051a883212f6" }, "outputs": [], "source": [
    "# One distance per centroid, in centroid order. Iterating over centroids.index\n",
    "# keeps this correct if the number of centroids changes.\n",
    "distances = [\n",
    "    squared_euclidean(\n",
    "        data_x,\n",
    "        data_y,\n",
    "        centroids.at[i, 'Average total business income'],\n",
    "        centroids.at[i, 'Average total business expenses'],\n",
    "    )\n",
    "    for i in centroids.index\n",
    "]\n",
    "distances"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "azLmJ_FB7189" }, "outputs": [], "source": [
    "# Position of the smallest distance in the list, i.e. the nearest centroid.\n",
    "cluster_index = distances.index(min(distances))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "wA2OZBYb9NoJ" }, "outputs": [], "source": [
    "# Creates the 'cluster' column on first use; rows not assigned yet stay empty.\n",
    "df.at[0, 'cluster'] = cluster_index"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "id": "1kSpYhB89PBf", "outputId": "0905b440-75c5-42b8-ef3b-aa277673e9cd" }, "outputs": [], "source": [
    "df.head()"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Assign the remaining rows shown by `df.head()`"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "id": "yj34XK-N9QKH", "outputId": "6b360e91-2cea-4f4d-af87-7d0e8f58f80e" }, "outputs": [], "source": [
    "# Same steps as for row 0, repeated for rows 1 to 4 in a loop.\n",
    "income_col = 'Average total business income'\n",
    "expenses_col = 'Average total business expenses'\n",
    "\n",
    "for row in range(1, 5):\n",
    "    distances = [\n",
    "        squared_euclidean(\n",
    "            df.at[row, income_col],\n",
    "            df.at[row, expenses_col],\n",
    "            centroids.at[i, income_col],\n",
    "            centroids.at[i, expenses_col],\n",
    "        )\n",
    "        for i in centroids.index\n",
    "    ]\n",
    "    df.at[row, 'cluster'] = distances.index(min(distances))\n",
    "\n",
    "df.head()"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Plot the assigned points with the centroids"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "colab_type": "code", "id": "Gbodvbg190ee", "outputId": "879b69ec-52e1-473f-e9a0-9fcfe7e348d5" }, "outputs": [], "source": [
    "# Same scatter of the first five rows, now coloured by the assigned cluster.\n",
    "chart1 = alt.Chart(df.head()).mark_circle().encode(\n",
    "    x='Average total business income',\n",
    "    y='Average total business expenses',\n",
    "    color='cluster:N',\n",
    "    tooltip=['Postcode', 'cluster', 'Average total business income', 'Average total business expenses']\n",
    ").interactive()\n",
    "\n",
    "# chart2 (the black centroid layer built earlier) is unchanged, so it is reused as is.\n",
    "chart1 + chart2"
  ] }
 ],
 "metadata": { "colab": { "collapsed_sections": [], "name": "Exercise5_4.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 1 }