mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 16:41:05 +00:00
397 lines
9.9 KiB
Plaintext
397 lines
9.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "teVYTWp62NMf"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"import random\n",
"\n",
"import altair as alt\n",
"import pandas as pd\n",
"from sklearn.cluster import KMeans"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "b_w9MPzrszPF"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_url = '../DataSet/taxstats2015.csv'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "9dP6E2ye3Jia"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_csv(file_url, usecols=['Postcode', 'Average total business income', 'Average total business expenses'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "6sx1xL3h4Ffz"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = df[['Average total business income', 'Average total business expenses']]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "k5zU1YmA4W-T"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Record the smallest and largest value of each feature column; these bound the random centroids drawn later.\n",
"income_col = df['Average total business income']\n",
"expenses_col = df['Average total business expenses']\n",
"\n",
"business_income_min, business_income_max = income_col.min(), income_col.max()\n",
"business_expenses_min, business_expenses_max = expenses_col.min(), expenses_col.max()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 85
|
|
},
|
|
"colab_type": "code",
|
|
"id": "2IQm7qpU4p_S",
|
|
"outputId": "7dc1ba5c-8b46-4fde-b910-fac098ab6c8a"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Print the four bounds, one per line: income min/max, then expenses min/max.\n",
"for bound in (\n",
"    business_income_min,\n",
"    business_income_max,\n",
"    business_expenses_min,\n",
"    business_expenses_max,\n",
"):\n",
"    print(bound)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "UlHRqugF4unX"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Seed Python's `random` module so the centroids sampled in the following cells are the same on every run.\n",
"# (`random` is imported with the other dependencies in the first cell.)\n",
"random.seed(42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "eqGlqkPi5Uo2"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"centroids = pd.DataFrame()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "dMCfxYzA5Vv8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"centroids['Average total business income'] = random.sample(range(business_income_min, business_income_max), 4)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "CE7GGrvv5wBm"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"centroids['Average total business expenses'] = random.sample(range(business_expenses_min, business_expenses_max), 4)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 173
|
|
},
|
|
"colab_type": "code",
|
|
"id": "zBW4apPX5xb2",
|
|
"outputId": "f6a9fec5-9075-489d-dd48-df3974f28eb9"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"centroids['cluster'] = centroids.index\n",
|
|
"centroids"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "eIA0hzt16xx7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Orange scatter of the first five rows: income on x, expenses on y, with hover tooltips.\n",
"chart1 = (\n",
"    alt.Chart(df.head())\n",
"    .mark_circle()\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color=alt.value('orange'),\n",
"        tooltip=['Postcode', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "7PGvAyEL63_R"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Centroids drawn as larger black markers on the same two axes.\n",
"chart2 = (\n",
"    alt.Chart(centroids)\n",
"    .mark_circle(size=100)\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color=alt.value('black'),\n",
"        tooltip=['cluster', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 363
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Xm4FTXtp65ZB",
|
|
"outputId": "b824d0a2-1440-42a4-bd21-4d12f9b1454d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"chart1 + chart2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "eQKhecFQ66sa"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"def squared_euclidean(data_x, data_y, centroid_x, centroid_y):\n",
"    \"\"\"Return the squared Euclidean distance between a point and a centroid.\n",
"\n",
"    Args:\n",
"        data_x: x coordinate of the data point.\n",
"        data_y: y coordinate of the data point.\n",
"        centroid_x: x coordinate of the centroid.\n",
"        centroid_y: y coordinate of the centroid.\n",
"\n",
"    Returns:\n",
"        (data_x - centroid_x)**2 + (data_y - centroid_y)**2; no square root is taken.\n",
"    \"\"\"\n",
"    offset_x = data_x - centroid_x\n",
"    offset_y = data_y - centroid_y\n",
"    return offset_x**2 + offset_y**2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ORuNfqEy7zB5"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_x = df.at[0, 'Average total business income']\n",
|
|
"data_y = df.at[0, 'Average total business expenses']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "yakWRQoN70iU",
|
|
"outputId": "abf5ef31-afc7-4eee-9b68-051a883212f6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Squared distance from the point (data_x, data_y) to each of the four centroids, in centroid order.\n",
"distances = [\n",
"    squared_euclidean(\n",
"        data_x,\n",
"        data_y,\n",
"        centroids.at[k, 'Average total business income'],\n",
"        centroids.at[k, 'Average total business expenses'],\n",
"    )\n",
"    for k in range(4)\n",
"]\n",
"distances"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "azLmJ_FB7189"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"cluster_index = distances.index(min(distances))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "wA2OZBYb9NoJ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.at[0, 'cluster'] = cluster_index"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "1kSpYhB89PBf",
|
|
"outputId": "0905b440-75c5-42b8-ef3b-aa277673e9cd"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "yj34XK-N9QKH",
|
|
"outputId": "6b360e91-2cea-4f4d-af87-7d0e8f58f80e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Row 0 is assigned in the cells above; give rows 1-4 the index of their nearest centroid.\n",
"# After the loop, `distances` holds the values computed for the last row processed.\n",
"for row in range(1, 5):\n",
"    distances = [\n",
"        squared_euclidean(\n",
"            df.at[row, 'Average total business income'],\n",
"            df.at[row, 'Average total business expenses'],\n",
"            centroids.at[i, 'Average total business income'],\n",
"            centroids.at[i, 'Average total business expenses'],\n",
"        )\n",
"        for i in range(len(centroids))\n",
"    ]\n",
"    # Position of the smallest squared distance (the first one on ties) is the cluster label.\n",
"    df.at[row, 'cluster'] = distances.index(min(distances))\n",
"\n",
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 363
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Gbodvbg190ee",
|
|
"outputId": "879b69ec-52e1-473f-e9a0-9fcfe7e348d5"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Scatter of the first five rows, coloured by the nominal `cluster` column.\n",
"chart1 = (\n",
"    alt.Chart(df.head())\n",
"    .mark_circle()\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color='cluster:N',\n",
"        tooltip=['Postcode', 'cluster', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")\n",
"\n",
"# Centroids drawn as larger black markers on the same two axes.\n",
"chart2 = (\n",
"    alt.Chart(centroids)\n",
"    .mark_circle(size=100)\n",
"    .encode(\n",
"        x='Average total business income',\n",
"        y='Average total business expenses',\n",
"        color=alt.value('black'),\n",
"        tooltip=['cluster', 'Average total business income', 'Average total business expenses'],\n",
"    )\n",
"    .interactive()\n",
")\n",
"\n",
"# Layer the two charts into one figure.\n",
"chart1 + chart2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "waR0wfwn-OjS"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Exercise5_4.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|