mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 08:31:59 +00:00
283 lines
5.5 KiB
Plaintext
283 lines
5.5 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "i0_6mTjFH-_v"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.cluster import KMeans\n",
|
|
"import altair as alt\n",
|
|
"from sklearn.preprocessing import StandardScaler"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "pd3ELQrbIUvQ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_url = '../DataSet/german.data-numeric'"
|
|
]
|
|
},
|
|
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
    "colab": {
      "base_uri": "https://localhost:8080/",
      "height": 71
    },
    "colab_type": "code",
    "id": "652VWTRkIZOc",
    "outputId": "9fae4df0-e2b3-4f8b-8c98-bf541d1dbf90"
  },
  "outputs": [],
  "source": [
    "# Read the file without a header row, splitting fields on runs of two or\n",
    "# more whitespace characters.\n",
    "# - engine='python' is explicit because the C parser does not support a\n",
    "#   multi-character regex separator (pandas would otherwise fall back with\n",
    "#   a ParserWarning).\n",
    "# - Columns are named X0, X1, ... with add_prefix, because read_csv's\n",
    "#   `prefix` argument was removed in pandas 2.0.\n",
    "# - The separator is a raw string so that \\s is not treated as a string escape.\n",
    "df = pd.read_csv(file_url, header=None, sep=r'\\s\\s+', engine='python').add_prefix('X')"
  ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "FKd0dGxfIat_",
|
|
"outputId": "a34b7d62-6ab2-4c9a-e194-e185ad0b4ab3"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "QQs2uYP2Jpe6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = df[['X3', 'X9']]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "A3JCcQwtK2M_"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"standard_scaler = StandardScaler()\n",
|
|
"X_scaled = standard_scaler.fit_transform(X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "FAxew7byK8_k"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"clusters = pd.DataFrame()\n",
|
|
"inertia = []"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "YxB1jOrUK_Ky"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"clusters['cluster_range'] = range(1, 15)"
|
|
]
|
|
},
|
|
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Wj9qG7XWLAis"
  },
  "outputs": [],
  "source": [
    "# Fit one k-means model per candidate cluster count and record its inertia.\n",
    "# The list is emptied first so that re-running this cell does not append a\n",
    "# second set of values (which would no longer match the length of `clusters`).\n",
    "inertia.clear()\n",
    "for k in clusters['cluster_range']:\n",
    "    # n_init is pinned to 10 (the historical default) because scikit-learn 1.4\n",
    "    # changed the default to 'auto', which would alter the fitted models.\n",
    "    kmeans = KMeans(n_clusters=k, random_state=8, n_init=10).fit(X_scaled)\n",
    "    inertia.append(kmeans.inertia_)"
  ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "n8a_y67oLCEb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"clusters['inertia'] = inertia"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 483
|
|
},
|
|
"colab_type": "code",
|
|
"id": "ChBnw2jMLDq8",
|
|
"outputId": "42868bc9-afee-4fcc-b2ed-352329e12880"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"clusters"
|
|
]
|
|
},
|
|
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
    "colab": {
      "base_uri": "https://localhost:8080/",
      "height": 368
    },
    "colab_type": "code",
    "id": "xm0azWh-LEwX",
    "outputId": "678485f8-dea9-4528-dcf8-2d6b4733611b"
  },
  "outputs": [],
  "source": [
    "# Line chart of inertia against the candidate number of clusters.\n",
    "alt.Chart(clusters).mark_line().encode(x='cluster_range', y='inertia')"
  ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "nJF3LIAFLctS"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"clusters_number = 5"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 68
|
|
},
|
|
"colab_type": "code",
|
|
"id": "gZZnYiSyL60J",
|
|
"outputId": "ed057701-8ac7-47bf-952c-9434feb2520e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"kmeans = KMeans(random_state=1, n_clusters=clusters_number, init='k-means++', n_init=50, max_iter=1000)\n",
|
|
"kmeans.fit(X_scaled)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "klTbcYaQL8F6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['cluster'] = kmeans.predict(X_scaled)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 368
|
|
},
|
|
"colab_type": "code",
|
|
"id": "IKt44R6CL9cY",
|
|
"outputId": "a5570db0-7565-462f-d78c-26f554fc87ca"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"scatter_plot = alt.Chart(df).mark_circle()\n",
|
|
"scatter_plot.encode(x='X3', y='X9',color='cluster:N')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "3_Np9pJiL-6Z"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Activity5_1.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|