mirror of
https://github.com/fenago/data-science.git
synced 2026-05-05 00:51:50 +00:00
250 lines
5.4 KiB
Plaintext
250 lines
5.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "zEQuq8LJA3tn"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.cluster import KMeans\n",
|
|
"import altair as alt "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ZgMleSBTtEO6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Location of the input CSV. Despite the name, `file_url` holds a relative\n",
|
|
"# filesystem path, so it is resolved against the kernel's working directory.\n",
|
|
"file_url = '../DataSet/taxstats2015.csv'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "OyDwHaKiCDYQ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_csv(file_url, usecols=['Postcode', 'Average total business income', 'Average total business expenses'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "vnxGsM2kCGUp"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = df[['Average total business income', 'Average total business expenses']]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ih5PpKnICH_7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.preprocessing import MinMaxScaler\n",
|
|
"from sklearn.preprocessing import StandardScaler"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "WywT72WzCJUk",
|
|
"outputId": "2245ba90-b235-4061-f348-bb111e48c3e7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"min_max_scaler = MinMaxScaler()\n",
|
|
"min_max_scaler.fit(X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 136
|
|
},
|
|
"colab_type": "code",
|
|
"id": "LFbkKgsxCKq7",
|
|
"outputId": "ac247615-4f3d-4756-fe18-805c02dd558c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Rescale X with the already-fitted `min_max_scaler`, then display the result\n",
|
|
"# as the cell output.\n",
|
|
"X_min_max = min_max_scaler.transform(X)\n",
|
|
"X_min_max"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 68
|
|
},
|
|
"colab_type": "code",
|
|
"id": "5OehxZ5BCgG9",
|
|
"outputId": "ca35030a-786b-449c-947f-473c40758fb8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"kmeans = KMeans(random_state=1, n_clusters=4, init='k-means++', n_init=5)\n",
|
|
"kmeans.fit(X_min_max) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "HF_9Aw8OC-5u"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Store each row's cluster label from the k-means model fitted on X_min_max.\n",
|
|
"# NOTE(review): a later cell rebinds `kmeans` to a model fitted on X_scaled, so\n",
|
|
"# re-running this cell after that one would label rows with the wrong model.\n",
|
|
"df['cluster8'] = kmeans.predict(X_min_max)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 368
|
|
},
|
|
"colab_type": "code",
|
|
"id": "D2-atFYFDIae",
|
|
"outputId": "9e24a0cc-1918-433a-cae5-5c2f5480d8a8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"scatter_plot = alt.Chart(df).mark_circle()\n",
|
|
"scatter_plot.encode(x='Average total business income', y='Average total business expenses',color='cluster8:N',\n",
|
|
" tooltip=['Postcode', 'cluster8', 'Average total business income', 'Average total business expenses']\n",
|
|
").interactive() "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 68
|
|
},
|
|
"colab_type": "code",
|
|
"id": "kvzQBGgvDVPO",
|
|
"outputId": "329e0b77-e79e-45ea-dba6-89154eb8aa29"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"standard_scaler = StandardScaler()\n",
|
|
"X_scaled = standard_scaler.fit_transform(X)\n",
|
|
"kmeans = KMeans(random_state=1, n_clusters=4, init='k-means++', n_init=5)\n",
|
|
"kmeans.fit(X_scaled)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "X44fDpkqEPdY"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Store each row's cluster label from the k-means model fitted on X_scaled.\n",
|
|
"df['cluster9'] = kmeans.predict(X_scaled)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 368
|
|
},
|
|
"colab_type": "code",
|
|
"id": "SiKXkI3bEj34",
|
|
"outputId": "e8fb6558-9ea0-442c-9461-e1f83f5a887d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"scatter_plot = alt.Chart(df).mark_circle()\n",
|
|
"scatter_plot.encode(x='Average total business income', y='Average total business expenses',color='cluster9:N',\n",
|
|
" tooltip=['Postcode', 'cluster9', 'Average total business income', 'Average total business expenses']\n",
|
|
").interactive() "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "eyoC0HiXElOC"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Exercise5_5.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|