mirror of
https://github.com/fenago/data-science.git
synced 2026-05-06 01:22:41 +00:00
613 lines
13 KiB
Plaintext
613 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "HEiOAwQPW0qb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "j7whidfaYjns"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_url = '../dataset/Speed_Dating_Data.csv'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "nFMz2jNVt-xy"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_csv(file_url)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 253
|
|
},
|
|
"colab_type": "code",
|
|
"id": "n0xY9ATvN6-M",
|
|
"outputId": "60ba0a8d-204b-4c4f-af64-4d2bea534af0"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "_ciEFAv5atrV",
|
|
"outputId": "178d28f1-4fcd-41b3-a2e1-1fff6195523a"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "lqaDAiGzYrOS",
|
|
"outputId": "13c51cb5-23da-4ae8-b33c-a77367c0f933"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.duplicated().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "d4ctPmu4ZWJ8",
|
|
"outputId": "4906a1a5-5ae8-4e94-bad1-e04c31da3438"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.duplicated(subset=['iid','id','partner','pid']).sum()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "zw6b3FrE6Z_-"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"scale_1_10 = ['imprace', 'imprelig', 'sports', 'tvsports', 'exercise', 'dining',\n",
|
|
"'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', \n",
|
|
"'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga',\n",
|
|
"'exphappy', 'satis_2']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "T9mY-J2rN1eF"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def check_range(column, min_value, max_value):\n",
|
|
" return (column < min_value) | (column > max_value)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "-5aPwH8aUSvo",
|
|
"outputId": "76005c6e-2d51-4c99-b06d-3f625c24db1c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"unexpected_mask = check_range(df['imprace'], 1, 10)\n",
|
|
"unexpected_mask.sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "AGvPiWPDTZXr"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def print_unexpected(df, col_name, unexpected_mask):\n",
|
|
" if unexpected_mask.sum() > 0:\n",
|
|
" print(col_name)\n",
|
|
" print(unexpected_mask.sum())\n",
|
|
" print(df.loc[unexpected_mask,col_name].unique())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 68
|
|
},
|
|
"colab_type": "code",
|
|
"id": "tIMHIsNqUjQ1",
|
|
"outputId": "77cf6a47-00aa-43be-8cbf-237e6cb24130"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print_unexpected(df, 'imprace', unexpected_mask)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "KB4iQHewT4OZ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def check_ranges(df, col_list, min_value, max_value):\n",
|
|
" for col_name in col_list:\n",
|
|
" unexpected_mask = check_range(df[col_name], min_value, max_value)\n",
|
|
" print_unexpected(df, col_name, unexpected_mask)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 578
|
|
},
|
|
"colab_type": "code",
|
|
"id": "bqzkToaoUII-",
|
|
"outputId": "7f6e1039-1e61-47fc-c920-2b2def608199"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_ranges(df, scale_1_10, 1, 10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "BIOoCo6zWqeZ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def replace_value(df, col_name, incorrect_value, new_value):\n",
|
|
" df.loc[df[col_name] == incorrect_value, col_name] = new_value\n",
|
|
" print(col_name)\n",
|
|
" print(df[col_name].unique())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"id": "a5u1zYGQP0X_",
|
|
"outputId": "596e58cf-5e58-482c-93b0-7985b4c70c45"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"replace_value(df, 'gaming', 14, 10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"id": "hplI-CLuW_ms",
|
|
"outputId": "92d8f6b3-4750-46e5-e558-b5d37cae15e1"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"replace_value(df, 'reading', 13, 10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "H9TkNHqZSo03"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for suffix in ['1_1', '1_2', '1_3', '1_s', '2_1', '2_2', '2_3', '4_1', '4_2', '4_3', '7_2', '7_3']:\n",
|
|
" suffix_cols = [col for col in df.columns if col.endswith(suffix)]\n",
|
|
" check_ranges(df, suffix_cols, 0, 100)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 272
|
|
},
|
|
"colab_type": "code",
|
|
"id": "X8aBm1hTVK0Q",
|
|
"outputId": "8c96fb93-6fe8-443f-ecdb-6e376640f567"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for suffix in ['3_1', '3_2', '3_3', '5_1', '5_2', '5_3', '3_s']:\n",
|
|
" suffix_cols = [col for col in df.columns if col.endswith(suffix)]\n",
|
|
" check_ranges(df, suffix_cols, 1, 10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 187
|
|
},
|
|
"colab_type": "code",
|
|
"id": "EOmcJR8k6Kl8",
|
|
"outputId": "c57eea2e-1cde-4b75-9236-54c4deafa631"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for col_name in ['attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3']:\n",
|
|
" replace_value(df, col_name, 12, 10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 221
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Rv1a7YLL63I8",
|
|
"outputId": "0578236d-45fe-4d47-dfa1-23b16dabab1f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "uW87dAYWYp5F"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"num_cols = ['round', 'order', 'int_corr', 'age', 'mn_sat', 'income', 'expnum']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "o25dBAOEdbv0"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"cat_cols = df.columns.difference(num_cols)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "0AJNMy0Ec6ea"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for col_name in cat_cols:\n",
|
|
" df[col_name] = df[col_name].astype('category')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 408
|
|
},
|
|
"colab_type": "code",
|
|
"id": "973po6fCdCXa",
|
|
"outputId": "e7faa0f4-c875-4715-d207-dfde3bc3d091"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 272
|
|
},
|
|
"colab_type": "code",
|
|
"id": "q1a8onHi79Z7",
|
|
"outputId": "a5ebc799-ec6e-461b-bbcc-806ba2d88c70"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df[num_cols].isna().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 612
|
|
},
|
|
"colab_type": "code",
|
|
"id": "XPAAELUR9_qX",
|
|
"outputId": "be5aef72-f7f0-42c3-c473-6e5024e7cce4"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['int_corr'].unique()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "1zTWk7dtBbMe"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"int_corr_mask = df['int_corr'].isna()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "apH4JpnGBgNG",
|
|
"outputId": "d0533a46-3bba-4e80-b570-0e3cf631a82e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"int_corr_mask.sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"id": "I0wuvkFHBlml",
|
|
"outputId": "90ef52c8-e93a-4092-f302-72bf5ddf7348"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"int_corr_mean = df['int_corr'].mean()\n",
|
|
"print(int_corr_mean)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Q_m65ZKLB-jx"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['int_corr'] = df['int_corr'].fillna(int_corr_mean)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"id": "Jr6BfU45CDQw",
|
|
"outputId": "5e1edb25-532a-4cc5-a180-f5d2100de1ed"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['int_corr'].isna().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "BSsH50CEZaDg"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"missing_num_cols = ['age', 'mn_sat', 'income', 'expnum']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 1000
|
|
},
|
|
"colab_type": "code",
|
|
"id": "1SAofEd0CYdM",
|
|
"outputId": "7f640988-fc41-4718-a131-51b36f2c0105"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for col_name in missing_num_cols:\n",
|
|
" print(col_name)\n",
|
|
" print(df[col_name].unique())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 289
|
|
},
|
|
"colab_type": "code",
|
|
"id": "MH3Qd9LIHeKF",
|
|
"outputId": "324696ab-c891-44c4-e4db-b38cc081d157"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for col_name in missing_num_cols:\n",
|
|
" col_median = df[col_name].median()\n",
|
|
" df[col_name] = df[col_name].fillna(col_median)\n",
|
|
" print(col_name)\n",
|
|
" print(col_median)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 289
|
|
},
|
|
"colab_type": "code",
|
|
"id": "T-CLjjkVHrfq",
|
|
"outputId": "581ddf09-ff1b-44ce-8adb-d5f55d5a648e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for col_name in missing_num_cols:\n",
|
|
" print(col_name)\n",
|
|
" print(df[col_name].isna().sum())"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Activity11.1.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|