mlessentials/Lab11/Exercise11.04/Exercise11_04.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "HEiOAwQPW0qb"
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "j7whidfaYjns"
   },
   "outputs": [],
   "source": [
    "# Relative path to the horse-colic data file that the cells below load\n",
    "file_url = '../dataset/horse-colic.data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "nFMz2jNVt-xy"
   },
   "outputs": [],
   "source": [
    "# The file has no header row and uses whitespace as the delimiter.\n",
    "# `prefix=` was deprecated in pandas 1.4 and removed in 2.0, so the columns\n",
    "# are renamed to X0, X1, ... with add_prefix(), which works on every version.\n",
    "df = pd.read_csv(file_url, header=None, sep='\\s+').add_prefix('X')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "n0xY9ATvN6-M",
    "outputId": "8deac724-2a1e-4a73-abde-a2cca7f9cfe3"
   },
   "outputs": [],
   "source": [
    "# Show the first rows of the freshly loaded DataFrame\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ASEdozexcGbY"
   },
   "outputs": [],
   "source": [
    "# Re-read the file, this time telling pandas to treat '?' as a missing value.\n",
    "# `prefix=` was deprecated in pandas 1.4 and removed in 2.0, so the columns\n",
    "# are renamed to X0, X1, ... with add_prefix(), which works on every version.\n",
    "df = pd.read_csv(file_url, header=None, sep='\\s+', na_values='?').add_prefix('X')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "NLiPeTsPcHpg",
    "outputId": "26468262-5891-4d2a-b801-a2af377f0f7f"
   },
   "outputs": [],
   "source": [
    "# Show the first rows again after re-reading with na_values='?'\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 510
    },
    "colab_type": "code",
    "id": "Rv1a7YLL63I8",
    "outputId": "4e5dd925-59d8-43b2-ac37-5221de09052c"
   },
   "outputs": [],
   "source": [
    "# List the dtype pandas inferred for each column\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 510
    },
    "colab_type": "code",
    "id": "q1a8onHi79Z7",
    "outputId": "5c3137bc-455e-4f93-f9c2-9aebffd34141"
   },
   "outputs": [],
   "source": [
    "# Count the missing values in every column\n",
    "df.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "1zTWk7dtBbMe"
   },
   "outputs": [],
   "source": [
    "# Boolean mask that is True for the rows where X0 is missing\n",
    "x0_mask = df['X0'].isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "apH4JpnGBgNG",
    "outputId": "c808828a-7081-478e-b4a2-1de161ae1619"
   },
   "outputs": [],
   "source": [
    "# Number of rows with a missing X0 (each True counts as 1 when summed)\n",
    "x0_mask.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "I0wuvkFHBlml",
    "outputId": "4bb32466-71e6-496b-fceb-d0d3e138d25f"
   },
   "outputs": [],
   "source": [
    "# Median of X0; median() skips NaN by default, so only observed values count\n",
    "x0_median = df['X0'].median()\n",
    "print(x0_median)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Q_m65ZKLB-jx"
   },
   "outputs": [],
   "source": [
    "# Replace the missing X0 values with the column median.\n",
    "# The result is assigned back to the column: fillna(inplace=True) on df['X0']\n",
    "# acts on a temporary Series, which pandas 2.x warns about and which leaves df\n",
    "# unchanged under Copy-on-Write (the default in pandas 3.0).\n",
    "df['X0'] = df['X0'].fillna(x0_median)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "Jr6BfU45CDQw",
    "outputId": "01236dae-58f5-4768-81f7-66515325fb7b"
   },
   "outputs": [],
   "source": [
    "# Count the missing values remaining in X0 after the fill\n",
    "df['X0'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 969
    },
    "colab_type": "code",
    "id": "MH3Qd9LIHeKF",
    "outputId": "4bd4e62a-1bef-4301-faa8-07607c134a66"
   },
   "outputs": [],
   "source": [
    "# Impute every column with its own median and print each column name/median.\n",
    "# The filled Series is assigned back to the DataFrame because calling\n",
    "# fillna(inplace=True) on df[col_name] acts on a temporary Series: pandas 2.x\n",
    "# warns about it and Copy-on-Write (pandas 3.0) leaves df unchanged.\n",
    "for col_name in df.columns:\n",
    "    col_median = df[col_name].median()\n",
    "    df[col_name] = df[col_name].fillna(col_median)\n",
    "    print(col_name)\n",
    "    print(col_median)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 510
    },
    "colab_type": "code",
    "id": "T-CLjjkVHrfq",
    "outputId": "04e8837c-39ad-49e6-ada4-0f0352151761"
   },
   "outputs": [],
   "source": [
    "# Per-column count of missing values after the median fill loop\n",
    "df.isna().sum()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Exercise11.04.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}