Files
fenago f3b24b4b7f added
2021-02-07 15:16:01 +05:00

792 lines
19 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "HEiOAwQPW0qb"
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "j7whidfaYjns"
},
"outputs": [],
"source": [
"disp_url = '../Dataset/disp.csv'\n",
"trans_url = '../Dataset/trans.csv'\n",
"account_url = '../Dataset/account.csv'\n",
"client_url = '../Dataset/client.csv'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
},
"colab_type": "code",
"executionInfo": {
"elapsed": 8712,
"status": "ok",
"timestamp": 1574650430652,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "nFMz2jNVt-xy",
"outputId": "95f4874b-575f-4b8d-ede4-734116c4a502"
},
"outputs": [],
"source": [
"df_disp = pd.read_csv(disp_url, sep=';')\n",
"df_trans = pd.read_csv(trans_url, sep=';')\n",
"df_account = pd.read_csv(account_url, sep=';')\n",
"df_client = pd.read_csv(client_url, sep=';')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 8706,
"status": "ok",
"timestamp": 1574650430653,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "xAqQ7rpV0nkT",
"outputId": "d059dc0b-ace2-467e-c3cd-8361faf44170"
},
"outputs": [],
"source": [
"df_trans.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 8701,
"status": "ok",
"timestamp": 1574650430654,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "zybRLGrd2bPf",
"outputId": "8e1da605-af26-46e1-9bf1-6e5b9a337857"
},
"outputs": [],
"source": [
"df_trans.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 8696,
"status": "ok",
"timestamp": 1574650430655,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "IvvSvYl42Koi",
"outputId": "c90574b1-5620-4d6b-e13b-ce8ba5ae66f0"
},
"outputs": [],
"source": [
"df_account.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Fk2bTvEp17Bp"
},
"outputs": [],
"source": [
"df_trans_acc = pd.merge(df_trans, df_account, how='left', on='account_id')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9099,
"status": "ok",
"timestamp": 1574650431066,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "9BN6zHeL2lOx",
"outputId": "357d0b26-daed-429e-dd5f-7f9c0e5a4a27"
},
"outputs": [],
"source": [
"df_trans_acc.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9095,
"status": "ok",
"timestamp": 1574650431067,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "98f8YCkO2q41",
"outputId": "213b7737-8471-49f8-f25a-7aa78acac4f8"
},
"outputs": [],
"source": [
"df_disp.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ap--Zddi3KYG"
},
"outputs": [],
"source": [
"df_disp_owner = df_disp[df_disp['type'] == 'OWNER']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9088,
"status": "ok",
"timestamp": 1574650431068,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "8rvJpD6V3VWU",
"outputId": "e950a8f3-2d85-4c82-b9b6-d62b6df6cc2d"
},
"outputs": [],
"source": [
"df_disp_owner.duplicated(subset='account_id').sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9081,
"status": "ok",
"timestamp": 1574650431069,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "aeshCYj63Ezw",
"outputId": "27de9e85-0643-4997-e318-f3d6bd7b3dd6"
},
"outputs": [],
"source": [
"df_trans_acc_disp = pd.merge(df_trans_acc, df_disp_owner, how='left', on='account_id')\n",
"df_trans_acc_disp.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9076,
"status": "ok",
"timestamp": 1574650431070,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "J-_jUOwP3GGJ",
"outputId": "3b19d650-c852-41d9-fa99-7123158a34ce"
},
"outputs": [],
"source": [
"df_client.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9440,
"status": "ok",
"timestamp": 1574650431439,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "35JidayO4C5X",
"outputId": "da451ba8-56fd-4d66-f147-5d3c8f817991"
},
"outputs": [],
"source": [
"df_merged = pd.merge(df_trans_acc_disp, df_client, how='left', on=['client_id', 'district_id'])\n",
"df_merged.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 85
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9436,
"status": "ok",
"timestamp": 1574650431440,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "8wGMCNV547RR",
"outputId": "1069eb5a-7f28-47b4-8cf5-aa03fee7eac9"
},
"outputs": [],
"source": [
"df_merged.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2oyeYQnB5ODm"
},
"outputs": [],
"source": [
"df_merged.rename(columns={'date_x': 'trans_date', 'type_x': 'trans_type', 'date_y':'account_creation', 'type_y':'client_type'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 309
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9997,
"status": "ok",
"timestamp": 1574650432011,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "tiPA68we6G-v",
"outputId": "be267f16-4953-459f-a9d9-2139b91f4f31"
},
"outputs": [],
"source": [
"df_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 323
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9993,
"status": "ok",
"timestamp": 1574650432012,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "GV_1Jh4H6Qy4",
"outputId": "e0a7024c-8bc7-429c-a4c8-c012c21347fc"
},
"outputs": [],
"source": [
"df_merged.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tLCnpFRU5lna"
},
"outputs": [],
"source": [
"df_merged['trans_date'] = pd.to_datetime(df_merged['trans_date'], format=\"%y%m%d\")\n",
"df_merged['account_creation'] = pd.to_datetime(df_merged['account_creation'], format=\"%y%m%d\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 323
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9986,
"status": "ok",
"timestamp": 1574650432014,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "6f791uGS6P25",
"outputId": "37cec372-1702-40de-8981-c1320a696ff1"
},
"outputs": [],
"source": [
"df_merged.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "p3Z3dB8h7lXd"
},
"outputs": [],
"source": [
"df_merged['is_female'] = (df_merged['birth_number'] % 10000) / 5000 > 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9980,
"status": "ok",
"timestamp": 1574650432015,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "4_Ml3J8f7PI_",
"outputId": "68f53d4a-618f-4c25-b9f7-9ded0650191a"
},
"outputs": [],
"source": [
"df_merged['birth_number'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "X4yKolNl8ys8"
},
"outputs": [],
"source": [
"df_merged.loc[df_merged['is_female'] == True, 'birth_number'] -= 5000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9973,
"status": "ok",
"timestamp": 1574650432016,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "HxWTe7zS91aB",
"outputId": "4260e7bd-1b96-49d6-a392-a7efa8ac9e3d"
},
"outputs": [],
"source": [
"df_merged['birth_number'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9967,
"status": "ok",
"timestamp": 1574650432017,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "A8cKsZf86v9u",
"outputId": "75087216-14a2-40fa-966d-960ffeb45acb"
},
"outputs": [],
"source": [
"pd.to_datetime(df_merged['birth_number'], format=\"%y%m%d\", errors='coerce')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"executionInfo": {
"elapsed": 10897,
"status": "ok",
"timestamp": 1574650432952,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "F6YkBC4eClRA",
"outputId": "f676d8f3-f03a-4bbf-f93a-69e2b032c225"
},
"outputs": [],
"source": [
"df_merged['birth_number'] = df_merged['birth_number'].astype(str)\n",
"df_merged['birth_number'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"executionInfo": {
"elapsed": 10893,
"status": "ok",
"timestamp": 1574650432953,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "RkEYHyd4Dtz7",
"outputId": "5cba124b-35fc-4a9c-f790-a30ff44a6cb0"
},
"outputs": [],
"source": [
"import numpy as np\n",
"df_merged.loc[df_merged['birth_number'] == 'nan', 'birth_number'] = np.nan\n",
"df_merged['birth_number'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"executionInfo": {
"elapsed": 11346,
"status": "ok",
"timestamp": 1574650433411,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "y2LZf7Q3CIU7",
"outputId": "1e4fa5c2-3a93-4015-a27f-067466bdc28e"
},
"outputs": [],
"source": [
"df_merged.loc[~df_merged['birth_number'].isna(), 'birth_number'] = '19' + df_merged.loc[~df_merged['birth_number'].isna(), 'birth_number']\n",
"df_merged['birth_number'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 374
},
"colab_type": "code",
"executionInfo": {
"elapsed": 11341,
"status": "ok",
"timestamp": 1574650433411,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "14FMFck_D5pE",
"outputId": "64ade2cc-2cea-43eb-c7eb-37931a93c001"
},
"outputs": [],
"source": [
"df_merged['birth_number'] = pd.to_datetime(df_merged['birth_number'], format=\"%Y%m%d\", errors='coerce')\n",
"df_merged['birth_number'].head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "UnOMNI5w5-jD"
},
"outputs": [],
"source": [
"df_merged['age_at_creation'] = df_merged['account_creation'] - df_merged['birth_number']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "DFt1MP-6E7cI"
},
"outputs": [],
"source": [
"df_merged['age_at_creation'] = df_merged['age_at_creation'] / np.timedelta64(1,'Y')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 309
},
"colab_type": "code",
"executionInfo": {
"elapsed": 11751,
"status": "ok",
"timestamp": 1574650433833,
"user": {
"displayName": "Anthony So",
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
"userId": "11809607246124237079"
},
"user_tz": -660
},
"id": "a_c3aRiiFzga",
"outputId": "13446355-b3af-478f-a143-931f06f91617"
},
"outputs": [],
"source": [
"df_merged['age_at_creation'] = df_merged['age_at_creation'].round()\n",
"df_merged.head()"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Activity12_1.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}