{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise 11.01: Handling duplicate rows\n",
    "\n",
    "Load the breast-cancer-wisconsin data file, count and inspect the rows that are exact duplicates of another row, then build a de-duplicated frame `df_unique`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": {}, "colab_type": "code", "id": "HEiOAwQPW0qb" },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": {}, "colab_type": "code", "id": "j7whidfaYjns" },
   "outputs": [],
   "source": [
    "# Relative path: resolved against the kernel's current working directory.\n",
    "file_url = '../dataset/breast-cancer-wisconsin.data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": {}, "colab_type": "code", "id": "Ysg_abDDATWd" },
   "outputs": [],
   "source": [
    "# The file is read without a header row, so the column labels are supplied here.\n",
    "col_names = [\n",
    "    'Sample code number',\n",
    "    'Clump Thickness',\n",
    "    'Uniformity of Cell Size',\n",
    "    'Uniformity of Cell Shape',\n",
    "    'Marginal Adhesion',\n",
    "    'Single Epithelial Cell Size',\n",
    "    'Bare Nuclei',\n",
    "    'Bland Chromatin',\n",
    "    'Normal Nucleoli',\n",
    "    'Mitoses',\n",
    "    'Class',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": {}, "colab_type": "code", "id": "nFMz2jNVt-xy" },
   "outputs": [],
   "source": [
    "# header=None: the first line of the file is data, not column labels.\n",
    "# names=col_names: label the columns at load time instead of patching df.columns afterwards.\n",
    "df = pd.read_csv(file_url, header=None, names=col_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "id": "jpSNmaLTI0Tc", "outputId": "5c2b15a8-0cb1-4952-97cf-a3634bd4fd23" },
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "colab_type": "code", "id": "FtMSHk6cHgwK", "outputId": "e52b0624-a90b-4615-a30f-d149d8a41f55" },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find duplicate rows\n",
    "\n",
    "`DataFrame.duplicated()` flags a row when all of its column values match those of another row.\n",
    "\n",
    "- With the default `keep='first'`, every occurrence except the first is flagged.\n",
    "- With `keep='last'`, every occurrence except the last is flagged."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "id": "5aPvl9bpuL5R", "outputId": "89abe994-3cf8-4210-96ab-7b3d8c8f470e" },
   "outputs": [],
   "source": [
    "# Number of rows flagged as duplicates (summing the boolean mask counts the True values).\n",
    "df.duplicated().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 314 }, "colab_type": "code", "id": "YZQGk3-uH-uw", "outputId": "157d6c5a-1ece-4f0d-e435-074b775207ac" },
   "outputs": [],
   "source": [
    "# Rows flagged with the default keep='first'.\n",
    "df.loc[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 314 }, "colab_type": "code", "id": "s27zOqKmIKd0", "outputId": "52c71d13-4b6a-4302-cbb9-0dc31cb14d48" },
   "outputs": [],
   "source": [
    "# Rows flagged with keep='last'.\n",
    "df.loc[df.duplicated(keep='last')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Drop duplicate rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": {}, "colab_type": "code", "id": "SoNSF72yIfl6" },
   "outputs": [],
   "source": [
    "# Keep the first occurrence of each repeated row; `df` itself is left untouched.\n",
    "df_unique = df.drop_duplicates(keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "id": "-NBl7SK9JWKa", "outputId": "feddf245-061b-4cf9-9d78-748fc675ffde" },
   "outputs": [],
   "source": [
    "df_unique.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "colab": {}, "colab_type": "code" },
   "outputs": [],
   "source": [
    "# Sanity checks: nothing is left duplicated, and exactly the flagged rows were removed.\n",
    "assert not df_unique.duplicated().any()\n",
    "assert len(df_unique) == len(df) - df.duplicated().sum()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Exercise11.01.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}