Files
fenago f3b24b4b7f added
2021-02-07 15:16:01 +05:00

397 lines
9.9 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "teVYTWp62NMf"
},
"outputs": [],
"source": [
"import random\n",
"\n",
"import altair as alt\n",
"import pandas as pd\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "b_w9MPzrszPF"
},
"outputs": [],
"source": [
"file_url = '../DataSet/taxstats2015.csv'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9dP6E2ye3Jia"
},
"outputs": [],
"source": [
"# Read only the postcode column and the two business columns from the CSV.\n",
"df = pd.read_csv(\n",
"    file_url,\n",
"    usecols=['Postcode', 'Average total business income', 'Average total business expenses'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "6sx1xL3h4Ffz"
},
"outputs": [],
"source": [
"X = df[['Average total business income', 'Average total business expenses']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "k5zU1YmA4W-T"
},
"outputs": [],
"source": [
"# Lower and upper bound of each feature; the centroid sampling cells use these as range limits.\n",
"business_income_min, business_income_max = (\n",
"    df['Average total business income'].min(),\n",
"    df['Average total business income'].max(),\n",
")\n",
"\n",
"business_expenses_min, business_expenses_max = (\n",
"    df['Average total business expenses'].min(),\n",
"    df['Average total business expenses'].max(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 85
},
"colab_type": "code",
"id": "2IQm7qpU4p_S",
"outputId": "7dc1ba5c-8b46-4fde-b910-fac098ab6c8a"
},
"outputs": [],
"source": [
"# One value per line: income min, income max, expenses min, expenses max.\n",
"print(\n",
"    business_income_min,\n",
"    business_income_max,\n",
"    business_expenses_min,\n",
"    business_expenses_max,\n",
"    sep='\\n',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "UlHRqugF4unX"
},
"outputs": [],
"source": [
"# Fix the stdlib RNG seed so the randomly drawn centroid coordinates are reproducible.\n",
"# (`random` is imported in the notebook's first cell together with the other imports.)\n",
"random.seed(42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "eqGlqkPi5Uo2"
},
"outputs": [],
"source": [
"centroids = pd.DataFrame()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "dMCfxYzA5Vv8"
},
"outputs": [],
"source": [
"# random.sample draws without replacement, so the 4 x-coordinates are distinct integers\n",
"# taken from [business_income_min, business_income_max).\n",
"centroids['Average total business income'] = random.sample(\n",
"    range(business_income_min, business_income_max),\n",
"    4,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CE7GGrvv5wBm"
},
"outputs": [],
"source": [
"# random.sample draws without replacement, so the 4 y-coordinates are distinct integers\n",
"# taken from [business_expenses_min, business_expenses_max).\n",
"centroids['Average total business expenses'] = random.sample(\n",
"    range(business_expenses_min, business_expenses_max),\n",
"    4,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 173
},
"colab_type": "code",
"id": "zBW4apPX5xb2",
"outputId": "f6a9fec5-9075-489d-dd48-df3974f28eb9"
},
"outputs": [],
"source": [
"centroids['cluster'] = centroids.index\n",
"centroids"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "eIA0hzt16xx7"
},
"outputs": [],
"source": [
"# Scatter plot of the rows returned by df.head(), drawn in orange.\n",
"chart1 = (\n",
"    alt.Chart(df.head())\n",
"    .mark_circle()\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color=alt.value('orange'),\n",
"        tooltip=['Postcode', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "7PGvAyEL63_R"
},
"outputs": [],
"source": [
"# Scatter plot of the centroids, drawn as larger black circles.\n",
"chart2 = (\n",
"    alt.Chart(centroids)\n",
"    .mark_circle(size=100)\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color=alt.value('black'),\n",
"        tooltip=['cluster', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"colab_type": "code",
"id": "Xm4FTXtp65ZB",
"outputId": "b824d0a2-1440-42a4-bd21-4d12f9b1454d"
},
"outputs": [],
"source": [
"chart1 + chart2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "eQKhecFQ66sa"
},
"outputs": [],
"source": [
"def squared_euclidean(data_x, data_y, centroid_x, centroid_y):\n",
"    \"\"\"Squared Euclidean distance between a 2-D data point and a centroid.\n",
"\n",
"    Args:\n",
"        data_x, data_y: coordinates of the data point.\n",
"        centroid_x, centroid_y: coordinates of the centroid.\n",
"\n",
"    Returns:\n",
"        (data_x - centroid_x)**2 + (data_y - centroid_y)**2, with no square root taken.\n",
"    \"\"\"\n",
"    delta_x = data_x - centroid_x\n",
"    delta_y = data_y - centroid_y\n",
"    return delta_x**2 + delta_y**2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ORuNfqEy7zB5"
},
"outputs": [],
"source": [
"data_x = df.at[0, 'Average total business income']\n",
"data_y = df.at[0, 'Average total business expenses']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "yakWRQoN70iU",
"outputId": "abf5ef31-afc7-4eee-9b68-051a883212f6"
},
"outputs": [],
"source": [
"# Squared distance from the point (data_x, data_y) to every centroid, one entry per centroid row.\n",
"# The count comes from len(centroids) rather than a literal 4, so it follows the centroid table;\n",
"# this relies on centroids keeping its default 0..n-1 integer index.\n",
"distances = [\n",
"    squared_euclidean(\n",
"        data_x,\n",
"        data_y,\n",
"        centroids.at[i, 'Average total business income'],\n",
"        centroids.at[i, 'Average total business expenses'],\n",
"    )\n",
"    for i in range(len(centroids))\n",
"]\n",
"distances"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "azLmJ_FB7189"
},
"outputs": [],
"source": [
"cluster_index = distances.index(min(distances))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "wA2OZBYb9NoJ"
},
"outputs": [],
"source": [
"df.at[0, 'cluster'] = cluster_index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"id": "1kSpYhB89PBf",
"outputId": "0905b440-75c5-42b8-ef3b-aa277673e9cd"
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"id": "yj34XK-N9QKH",
"outputId": "6b360e91-2cea-4f4d-af87-7d0e8f58f80e"
},
"outputs": [],
"source": [
"# Assign rows 1-4 to their nearest centroid; row 0 was assigned step by step in the cells above.\n",
"# A single loop replaces four copy-pasted statement pairs that differed only in the row label.\n",
"for row in range(1, 5):\n",
"    distances = [\n",
"        squared_euclidean(\n",
"            df.at[row, 'Average total business income'],\n",
"            df.at[row, 'Average total business expenses'],\n",
"            centroids.at[i, 'Average total business income'],\n",
"            centroids.at[i, 'Average total business expenses'],\n",
"        )\n",
"        for i in range(len(centroids))\n",
"    ]\n",
"    # list.index gives the position of the smallest distance (first one on ties); with the\n",
"    # default 0..n-1 index on centroids that position is also the centroid's row label.\n",
"    df.at[row, 'cluster'] = distances.index(min(distances))\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"colab_type": "code",
"id": "Gbodvbg190ee",
"outputId": "879b69ec-52e1-473f-e9a0-9fcfe7e348d5"
},
"outputs": [],
"source": [
"# Same scatter of the df.head() rows, now coloured by the nominal 'cluster' column.\n",
"chart1 = (\n",
"    alt.Chart(df.head())\n",
"    .mark_circle()\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color='cluster:N',\n",
"        tooltip=['Postcode', 'cluster', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")\n",
"\n",
"# Centroids as larger black circles.\n",
"chart2 = (\n",
"    alt.Chart(centroids)\n",
"    .mark_circle(size=100)\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color=alt.value('black'),\n",
"        tooltip=['cluster', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")\n",
"\n",
"# Layer the two charts on shared axes.\n",
"chart1 + chart2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "waR0wfwn-OjS"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Exercise5_4.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}