mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 00:22:32 +00:00
206 lines
4.7 KiB
Plaintext
206 lines
4.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "OpvvISIrbLkH"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.cluster import KMeans\n",
|
|
"import altair as alt"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "arwd2VGxbQw_"
 },
 "outputs": [],
 "source": [
  "# Relative path of the CSV file that pd.read_csv loads in the next cell\n",
  "file_url = '../DataSet/taxstats2015.csv'"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "yTrw_H8CbYc5"
 },
 "outputs": [],
 "source": [
  "# Read only the three columns this notebook uses (two features plus 'Postcode' for tooltips)\n",
  "df = pd.read_csv(\n",
  "    file_url,\n",
  "    usecols=[\n",
  "        'Postcode',\n",
  "        'Average total business income',\n",
  "        'Average total business expenses',\n",
  "    ],\n",
  ")"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "Ch3Uw25bbbfs"
 },
 "outputs": [],
 "source": [
  "# Two-column frame passed to KMeans in the cells that follow\n",
  "X = df[[\n",
  "    'Average total business income',\n",
  "    'Average total business expenses',\n",
  "]]"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 68
  },
  "colab_type": "code",
  "id": "wcrvaHY6bdsX",
  "outputId": "d0d8fc95-1826-4529-9cf6-b60274ba2c82"
 },
 "outputs": [],
 "source": [
  "# K-means with 4 clusters, started from a single random initialisation (n_init=1)\n",
  "kmeans = KMeans(\n",
  "    random_state=1,\n",
  "    n_clusters=4,\n",
  "    init='random',\n",
  "    n_init=1,\n",
  ")\n",
  "# Kept as the last expression so the fitted estimator is displayed as the cell output\n",
  "kmeans.fit(X)"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "STeQ3-U6bgXr"
 },
 "outputs": [],
 "source": [
  "# Store the cluster index the fitted model predicts for every row of X\n",
  "cluster_labels = kmeans.predict(X)\n",
  "df['cluster3'] = cluster_labels"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "vmmjs2-Abgb0"
 },
 "outputs": [],
 "source": [
  "# Base chart over df drawn with circle marks; encodings are attached in the next cell\n",
  "scatter_plot = (\n",
  "    alt.Chart(df)\n",
  "    .mark_circle()\n",
  ")"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 368
  },
  "colab_type": "code",
  "id": "7RGhQXCpbklC",
  "outputId": "84a545ec-d4c9-42b2-f202-024bac3bcf0a"
 },
 "outputs": [],
 "source": [
  "# Income on x, expenses on y, colour by the nominal 'cluster3' label;\n",
  "# the tooltip lists the fields shown on hover\n",
  "scatter_plot.encode(\n",
  "    x='Average total business income',\n",
  "    y='Average total business expenses',\n",
  "    color='cluster3:N',\n",
  "    tooltip=[\n",
  "        'Postcode',\n",
  "        'cluster3',\n",
  "        'Average total business income',\n",
  "        'Average total business expenses',\n",
  "    ],\n",
  ").interactive()"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 368
  },
  "colab_type": "code",
  "id": "CDqsVn5lbknb",
  "outputId": "cdc68274-c524-45e0-f787-e81ab1483b03"
 },
 "outputs": [],
 "source": [
  "# Repeat the clustering with 10 random initialisations (n_init=10)\n",
  "kmeans = KMeans(random_state=1, n_clusters=4, init='random', n_init=10)\n",
  "# fit_predict fits the model and returns the cluster index of every row of X in one call,\n",
  "# replacing the separate fit(X) / predict(X) passes over the same data\n",
  "df['cluster4'] = kmeans.fit_predict(X)\n",
  "\n",
  "# New chart object over df, which now carries the 'cluster4' column\n",
  "scatter_plot = alt.Chart(df).mark_circle()\n",
  "\n",
  "# The title records n_init so this chart can be told apart from the other runs\n",
  "scatter_plot.encode(\n",
  "    x='Average total business income',\n",
  "    y='Average total business expenses',\n",
  "    color='cluster4:N',\n",
  "    tooltip=['Postcode', 'cluster4', 'Average total business income', 'Average total business expenses'],\n",
  ").properties(title='K-means clusters (n_init=10)').interactive()"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 368
  },
  "colab_type": "code",
  "id": "GpLqIG4hbkpv",
  "outputId": "d945fa30-667c-47e7-8f10-81f105d6a7af"
 },
 "outputs": [],
 "source": [
  "# Repeat the clustering with 100 random initialisations (n_init=100)\n",
  "kmeans = KMeans(random_state=1, n_clusters=4, init='random', n_init=100)\n",
  "# fit_predict fits the model and returns the cluster index of every row of X in one call,\n",
  "# replacing the separate fit(X) / predict(X) passes over the same data\n",
  "df['cluster5'] = kmeans.fit_predict(X)\n",
  "\n",
  "# New chart object over df, which now carries the 'cluster5' column\n",
  "scatter_plot = alt.Chart(df).mark_circle()\n",
  "\n",
  "# The title records n_init so this chart can be told apart from the other runs\n",
  "scatter_plot.encode(\n",
  "    x='Average total business income',\n",
  "    y='Average total business expenses',\n",
  "    color='cluster5:N',\n",
  "    tooltip=['Postcode', 'cluster5', 'Average total business income', 'Average total business expenses'],\n",
  ").properties(title='K-means clusters (n_init=100)').interactive()"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "dhTRyHeMbkrz"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Exercise5_04.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|