mirror of
https://github.com/fenago/data-science.git
synced 2026-05-06 01:22:41 +00:00
267 lines
5.1 KiB
Plaintext
267 lines
5.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "HEiOAwQPW0qb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "j7whidfaYjns"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_url = '../dataset/horse-colic.data'"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "nFMz2jNVt-xy"
 },
 "outputs": [],
 "source": [
  "# read_csv's `prefix=` argument was removed in pandas 2.0; with header=None the\n",
  "# columns are 0..n-1, so add_prefix('X') yields the same X0, X1, ... labels.\n",
  "# The separator is a raw string so that \\s is not treated as a string escape.\n",
  "df = pd.read_csv(file_url, header=None, sep=r'\\s+').add_prefix('X')"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "n0xY9ATvN6-M",
|
|
"outputId": "8deac724-2a1e-4a73-abde-a2cca7f9cfe3"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "ASEdozexcGbY"
 },
 "outputs": [],
 "source": [
  "# Reload the file, this time parsing '?' as NaN.\n",
  "# read_csv's `prefix=` argument was removed in pandas 2.0; add_prefix('X') on the\n",
  "# default integer column labels produces the same X0, X1, ... names.\n",
  "df = pd.read_csv(file_url, header=None, sep=r'\\s+', na_values='?').add_prefix('X')"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 204
|
|
},
|
|
"colab_type": "code",
|
|
"id": "NLiPeTsPcHpg",
|
|
"outputId": "26468262-5891-4d2a-b801-a2af377f0f7f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 510
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Rv1a7YLL63I8",
|
|
"outputId": "4e5dd925-59d8-43b2-ac37-5221de09052c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 510
|
|
},
|
|
"colab_type": "code",
|
|
"id": "q1a8onHi79Z7",
|
|
"outputId": "5c3137bc-455e-4f93-f9c2-9aebffd34141"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.isna().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "1zTWk7dtBbMe"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"x0_mask = df['X0'].isna()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "apH4JpnGBgNG",
|
|
"outputId": "c808828a-7081-478e-b4a2-1de161ae1619"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"x0_mask.sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "I0wuvkFHBlml",
|
|
"outputId": "4bb32466-71e6-496b-fceb-d0d3e138d25f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"x0_median = df['X0'].median()\n",
|
|
"print(x0_median)"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "Q_m65ZKLB-jx"
 },
 "outputs": [],
 "source": [
  "# Assign the filled column back to df. An in-place fillna on df['X0'] operates on a\n",
  "# temporary selection; under pandas Copy-on-Write it would leave df unchanged.\n",
  "df['X0'] = df['X0'].fillna(x0_median)"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Jr6BfU45CDQw",
|
|
"outputId": "01236dae-58f5-4768-81f7-66515325fb7b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['X0'].isna().sum()"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 969
  },
  "colab_type": "code",
  "id": "MH3Qd9LIHeKF",
  "outputId": "4bd4e62a-1bef-4301-faa8-07607c134a66"
 },
 "outputs": [],
 "source": [
  "# Replace the missing values of every column with that column's median.\n",
  "for col_name in df.columns:\n",
  "    col_median = df[col_name].median()\n",
  "    # Assign back instead of fillna(inplace=True): the in-place call acts on a\n",
  "    # temporary selection and does not update df under pandas Copy-on-Write.\n",
  "    df[col_name] = df[col_name].fillna(col_median)\n",
  "    print(col_name)\n",
  "    print(col_median)"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 510
|
|
},
|
|
"colab_type": "code",
|
|
"id": "T-CLjjkVHrfq",
|
|
"outputId": "04e8837c-39ad-49e6-ada4-0f0352151761"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.isna().sum()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Exercise11.04.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|