Files
fenago f3b24b4b7f added
2021-02-07 15:16:01 +05:00

283 lines
5.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "i0_6mTjFH-_v"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.cluster import KMeans\n",
"import altair as alt\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "pd3ELQrbIUvQ"
},
"outputs": [],
"source": [
"# Relative path of the data file that is passed to pd.read_csv further down.\n",
"file_url = '../DataSet/german.data-numeric'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
},
"colab_type": "code",
"id": "652VWTRkIZOc",
"outputId": "9fae4df0-e2b3-4f8b-8c98-bf541d1dbf90"
},
"outputs": [],
"source": [
"# Load the header-less file, splitting fields on runs of two or more whitespace characters.\n",
"# `read_csv(prefix=...)` was deprecated in pandas 1.4 and removed in 2.0, so the 'X'\n",
"# prefix is applied with `add_prefix` instead; the columns are still named X0, X1, ...\n",
"# A regex separator is only supported by the python parser engine; requesting it\n",
"# explicitly avoids the ParserWarning emitted when pandas falls back to it.\n",
"df = pd.read_csv(file_url, header=None, sep=r'\\s\\s+', engine='python').add_prefix('X')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"id": "FKd0dGxfIat_",
"outputId": "a34b7d62-6ab2-4c9a-e194-e185ad0b4ab3"
},
"outputs": [],
"source": [
"# Preview the first five rows of the loaded frame.\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "QQs2uYP2Jpe6"
},
"outputs": [],
"source": [
"# Keep only the two columns that are used as clustering features.\n",
"feature_columns = ['X3', 'X9']\n",
"X = df[feature_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "A3JCcQwtK2M_"
},
"outputs": [],
"source": [
"# Fit the scaler on X, then transform the same data with the fitted scaler.\n",
"standard_scaler = StandardScaler()\n",
"standard_scaler.fit(X)\n",
"X_scaled = standard_scaler.transform(X)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "FAxew7byK8_k"
},
"outputs": [],
"source": [
"# Empty containers that later cells fill in: a results frame and a list of inertia values.\n",
"clusters, inertia = pd.DataFrame(), []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "YxB1jOrUK_Ky"
},
"outputs": [],
"source": [
"# Candidate cluster counts to evaluate: 1 through 14 (the upper bound of range is exclusive).\n",
"clusters['cluster_range'] = range(1, 15)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Wj9qG7XWLAis"
},
"outputs": [],
"source": [
"# Fit one KMeans model per candidate cluster count and record its inertia.\n",
"# The list is reset here so the cell is idempotent: appending to a list created in\n",
"# another cell made every re-run add 14 more values, after which the assignment\n",
"# `clusters['inertia'] = inertia` fails with a length mismatch.\n",
"inertia = []\n",
"for k in clusters['cluster_range']:\n",
"    # n_init is pinned to 10 (the historical default) because scikit-learn 1.4\n",
"    # changed the default to 'auto', which would alter the results.\n",
"    kmeans = KMeans(n_clusters=k, random_state=8, n_init=10).fit(X_scaled)\n",
"    inertia.append(kmeans.inertia_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "n8a_y67oLCEb"
},
"outputs": [],
"source": [
"# Store the collected inertia values as a column alongside 'cluster_range'.\n",
"clusters['inertia'] = inertia"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 483
},
"colab_type": "code",
"id": "ChBnw2jMLDq8",
"outputId": "42868bc9-afee-4fcc-b2ed-352329e12880"
},
"outputs": [],
"source": [
"# Display the frame holding each candidate cluster count and its inertia.\n",
"clusters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 368
},
"colab_type": "code",
"id": "xm0azWh-LEwX",
"outputId": "678485f8-dea9-4528-dcf8-2d6b4733611b"
},
"outputs": [],
"source": [
"# Line chart of inertia (y) against the number of clusters (x).\n",
"inertia_chart = alt.Chart(clusters).mark_line()\n",
"inertia_chart.encode(x='cluster_range', y='inertia')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nJF3LIAFLctS"
},
"outputs": [],
"source": [
"# Number of clusters passed to the final KMeans fit below.\n",
"# NOTE(review): presumably chosen from the inertia chart above; confirm.\n",
"clusters_number = 5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
},
"colab_type": "code",
"id": "gZZnYiSyL60J",
"outputId": "ed057701-8ac7-47bf-952c-9434feb2520e"
},
"outputs": [],
"source": [
"# Final model: k-means++ initialisation, 50 restarts, up to 1000 iterations per run.\n",
"kmeans = KMeans(\n",
"    n_clusters=clusters_number,\n",
"    init='k-means++',\n",
"    n_init=50,\n",
"    max_iter=1000,\n",
"    random_state=1,\n",
")\n",
"kmeans.fit(X_scaled)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "klTbcYaQL8F6"
},
"outputs": [],
"source": [
"# Predict a cluster for every scaled row and store it in a new 'cluster' column of df.\n",
"cluster_labels = kmeans.predict(X_scaled)\n",
"df['cluster'] = cluster_labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 368
},
"colab_type": "code",
"id": "IKt44R6CL9cY",
"outputId": "a5570db0-7565-462f-d78c-26f554fc87ca"
},
"outputs": [],
"source": [
"# Scatter plot of X3 against X9, coloured by the 'cluster' column\n",
"# (the ':N' suffix tells altair to treat it as a nominal field).\n",
"scatter_plot = alt.Chart(df).mark_circle()\n",
"scatter_plot.encode(\n",
"    alt.X('X3'),\n",
"    alt.Y('X9'),\n",
"    alt.Color('cluster:N'),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "3_Np9pJiL-6Z"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Activity5_1.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}