Files
fenago f3b24b4b7f added
2021-02-07 15:16:01 +05:00

613 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "HEiOAwQPW0qb"
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "j7whidfaYjns"
},
"outputs": [],
"source": [
"file_url = '../dataset/Speed_Dating_Data.csv'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nFMz2jNVt-xy"
},
"outputs": [],
"source": [
"df = pd.read_csv(file_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 253
},
"colab_type": "code",
"id": "n0xY9ATvN6-M",
"outputId": "60ba0a8d-204b-4c4f-af64-4d2bea534af0"
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "_ciEFAv5atrV",
"outputId": "178d28f1-4fcd-41b3-a2e1-1fff6195523a"
},
"outputs": [],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "lqaDAiGzYrOS",
"outputId": "13c51cb5-23da-4ae8-b33c-a77367c0f933"
},
"outputs": [],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "d4ctPmu4ZWJ8",
"outputId": "4906a1a5-5ae8-4e94-bad1-e04c31da3438"
},
"outputs": [],
"source": [
"df.duplicated(subset=['iid','id','partner','pid']).sum()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "zw6b3FrE6Z_-"
},
"outputs": [],
"source": [
"scale_1_10 = ['imprace', 'imprelig', 'sports', 'tvsports', 'exercise', 'dining',\n",
"'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', \n",
"'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga',\n",
"'exphappy', 'satis_2']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "T9mY-J2rN1eF"
},
"outputs": [],
"source": [
"def check_range(column, min_value, max_value):\n",
"    \"\"\"Return an element-wise mask of values outside [min_value, max_value].\n",
"\n",
"    An element is flagged when it is strictly below min_value or strictly\n",
"    above max_value; values equal to either bound are not flagged.\n",
"    \"\"\"\n",
"    below_min = column < min_value\n",
"    above_max = column > max_value\n",
"    return below_min | above_max"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "-5aPwH8aUSvo",
"outputId": "76005c6e-2d51-4c99-b06d-3f625c24db1c"
},
"outputs": [],
"source": [
"unexpected_mask = check_range(df['imprace'], 1, 10)\n",
"unexpected_mask.sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "AGvPiWPDTZXr"
},
"outputs": [],
"source": [
"def print_unexpected(df, col_name, unexpected_mask):\n",
"    \"\"\"Print a short report for a column when the mask flags any row.\n",
"\n",
"    Prints the column name, the number of flagged rows and the distinct\n",
"    flagged values of df[col_name]. Prints nothing when no row is flagged.\n",
"    \"\"\"\n",
"    # Count once and reuse the result for both the guard and the report.\n",
"    n_unexpected = unexpected_mask.sum()\n",
"    if n_unexpected > 0:\n",
"        print(col_name)\n",
"        print(n_unexpected)\n",
"        print(df.loc[unexpected_mask, col_name].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
},
"colab_type": "code",
"id": "tIMHIsNqUjQ1",
"outputId": "77cf6a47-00aa-43be-8cbf-237e6cb24130"
},
"outputs": [],
"source": [
"print_unexpected(df, 'imprace', unexpected_mask)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "KB4iQHewT4OZ"
},
"outputs": [],
"source": [
"def check_ranges(df, col_list, min_value, max_value):\n",
"    \"\"\"Report out-of-range values for every column named in col_list.\n",
"\n",
"    Each column is checked with check_range against the same\n",
"    [min_value, max_value] bounds and reported with print_unexpected,\n",
"    so columns without out-of-range values produce no output.\n",
"    \"\"\"\n",
"    for col_name in col_list:\n",
"        unexpected_mask = check_range(df[col_name], min_value, max_value)\n",
"        print_unexpected(df, col_name, unexpected_mask)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 578
},
"colab_type": "code",
"id": "bqzkToaoUII-",
"outputId": "7f6e1039-1e61-47fc-c920-2b2def608199"
},
"outputs": [],
"source": [
"check_ranges(df, scale_1_10, 1, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BIOoCo6zWqeZ"
},
"outputs": [],
"source": [
"def replace_value(df, col_name, incorrect_value, new_value):\n",
"    \"\"\"Overwrite one value of a column in place, then print the result.\n",
"\n",
"    Every row of df[col_name] equal to incorrect_value is set to new_value\n",
"    (df itself is modified). The column name and the column's distinct\n",
"    values after the replacement are printed.\n",
"    \"\"\"\n",
"    rows_to_fix = df[col_name] == incorrect_value\n",
"    df.loc[rows_to_fix, col_name] = new_value\n",
"    print(col_name)\n",
"    print(df[col_name].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "a5u1zYGQP0X_",
"outputId": "596e58cf-5e58-482c-93b0-7985b4c70c45"
},
"outputs": [],
"source": [
"replace_value(df, 'gaming', 14, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "hplI-CLuW_ms",
"outputId": "92d8f6b3-4750-46e5-e558-b5d37cae15e1"
},
"outputs": [],
"source": [
"replace_value(df, 'reading', 13, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "H9TkNHqZSo03"
},
"outputs": [],
"source": [
"for suffix in ['1_1', '1_2', '1_3', '1_s', '2_1', '2_2', '2_3', '4_1', '4_2', '4_3', '7_2', '7_3']:\n",
" suffix_cols = [col for col in df.columns if col.endswith(suffix)]\n",
" check_ranges(df, suffix_cols, 0, 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 272
},
"colab_type": "code",
"id": "X8aBm1hTVK0Q",
"outputId": "8c96fb93-6fe8-443f-ecdb-6e376640f567"
},
"outputs": [],
"source": [
"for suffix in ['3_1', '3_2', '3_3', '5_1', '5_2', '5_3', '3_s']:\n",
" suffix_cols = [col for col in df.columns if col.endswith(suffix)]\n",
" check_ranges(df, suffix_cols, 1, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"id": "EOmcJR8k6Kl8",
"outputId": "c57eea2e-1cde-4b75-9236-54c4deafa631"
},
"outputs": [],
"source": [
"for col_name in ['attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3']:\n",
" replace_value(df, col_name, 12, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"colab_type": "code",
"id": "Rv1a7YLL63I8",
"outputId": "0578236d-45fe-4d47-dfa1-23b16dabab1f"
},
"outputs": [],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "uW87dAYWYp5F"
},
"outputs": [],
"source": [
"num_cols = ['round', 'order', 'int_corr', 'age', 'mn_sat', 'income', 'expnum']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "o25dBAOEdbv0"
},
"outputs": [],
"source": [
"cat_cols = df.columns.difference(num_cols)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "0AJNMy0Ec6ea"
},
"outputs": [],
"source": [
"for col_name in cat_cols:\n",
" df[col_name] = df[col_name].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 408
},
"colab_type": "code",
"id": "973po6fCdCXa",
"outputId": "e7faa0f4-c875-4715-d207-dfde3bc3d091"
},
"outputs": [],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 272
},
"colab_type": "code",
"id": "q1a8onHi79Z7",
"outputId": "a5ebc799-ec6e-461b-bbcc-806ba2d88c70"
},
"outputs": [],
"source": [
"df[num_cols].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 612
},
"colab_type": "code",
"id": "XPAAELUR9_qX",
"outputId": "be5aef72-f7f0-42c3-c473-6e5024e7cce4"
},
"outputs": [],
"source": [
"df['int_corr'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "1zTWk7dtBbMe"
},
"outputs": [],
"source": [
"int_corr_mask = df['int_corr'].isna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "apH4JpnGBgNG",
"outputId": "d0533a46-3bba-4e80-b570-0e3cf631a82e"
},
"outputs": [],
"source": [
"int_corr_mask.sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "I0wuvkFHBlml",
"outputId": "90ef52c8-e93a-4092-f302-72bf5ddf7348"
},
"outputs": [],
"source": [
"int_corr_mean = df['int_corr'].mean()\n",
"print(int_corr_mean)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Q_m65ZKLB-jx"
},
"outputs": [],
"source": [
"# Assign the filled column back instead of calling fillna(inplace=True) on\n",
"# a column selection: that form is chained assignment and does not update\n",
"# df when pandas Copy-on-Write is active.\n",
"df['int_corr'] = df['int_corr'].fillna(int_corr_mean)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "Jr6BfU45CDQw",
"outputId": "5e1edb25-532a-4cc5-a180-f5d2100de1ed"
},
"outputs": [],
"source": [
"df['int_corr'].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BSsH50CEZaDg"
},
"outputs": [],
"source": [
"missing_num_cols = ['age', 'mn_sat', 'income', 'expnum']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"colab_type": "code",
"id": "1SAofEd0CYdM",
"outputId": "7f640988-fc41-4718-a131-51b36f2c0105"
},
"outputs": [],
"source": [
"for col_name in missing_num_cols:\n",
" print(col_name)\n",
" print(df[col_name].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 289
},
"colab_type": "code",
"id": "MH3Qd9LIHeKF",
"outputId": "324696ab-c891-44c4-e4db-b38cc081d157"
},
"outputs": [],
"source": [
"# Fill each numeric column's missing values with that column's median and\n",
"# print the median that was used.\n",
"for col_name in missing_num_cols:\n",
"    col_median = df[col_name].median()\n",
"    # Assign back rather than fillna(inplace=True) on a column selection,\n",
"    # which is chained assignment and does not update df under Copy-on-Write.\n",
"    df[col_name] = df[col_name].fillna(col_median)\n",
"    print(col_name)\n",
"    print(col_median)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 289
},
"colab_type": "code",
"id": "T-CLjjkVHrfq",
"outputId": "581ddf09-ff1b-44ce-8adb-d5f55d5a648e"
},
"outputs": [],
"source": [
"for col_name in missing_num_cols:\n",
" print(col_name)\n",
" print(df[col_name].isna().sum())"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Activity11.1.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}