mirror of
https://github.com/fenago/data-science.git
synced 2026-05-05 17:11:52 +00:00
466 lines
10 KiB
Plaintext
466 lines
10 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "_b0A-ElAnHj2"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import joblib\n",
|
|
"import pickle\n",
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"from sklearn.model_selection import train_test_split"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "xuyNn2VlnHm9"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_url = '../Dataset/phpMawTba.csv'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "qHM7W8jTnHye"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_csv(file_url)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 306
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7095,
|
|
"status": "ok",
|
|
"timestamp": 1574651007745,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "kHqvPP4V7d-G",
|
|
"outputId": "364f99a2-0438-4bc8-eb36-ee3e154a5b31"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ZOV3ihu0dnnG"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"y = df.pop('class')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 153
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7087,
|
|
"status": "ok",
|
|
"timestamp": 1574651007746,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "I4X1JPfqMnnL",
|
|
"outputId": "7f5a1b1a-fc11-4f76-fc47-64adf337989a"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Keep the labels of every column whose dtype compares equal to 'object'.\n",
|
|
"cat_columns = df.columns[df.dtypes == 'object'].tolist()\n",
|
|
"cat_columns"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "mtp9sRXbdnpx"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=8)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "G_rR0K9yOasg"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"column_categories = {}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "5QiCAi94dnsE"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# For each categorical column, store the category labels found in X_train.\n",
|
|
"column_categories.update(\n",
|
|
"    (col, X_train[col].astype('category').cat.categories) for col in cat_columns\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "w_JekLBEPeGg"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Persist the preprocessing artefacts. The `with` blocks close each handle so\n",
|
|
"# the pickles are completely written before a later cell reads them back.\n",
|
|
"with open(\"categories_data.pkl\", \"wb\") as categories_file:\n",
|
|
"    pickle.dump(column_categories, categories_file)\n",
|
|
"with open(\"categorical_columns.pkl\", \"wb\") as columns_file:\n",
|
|
"    pickle.dump(cat_columns, columns_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "otsp5ZZ_Pp42"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def apply_categories(input_df, cat_dict):\n",
|
|
"    \"\"\"Return a copy of `input_df` with the `cat_dict` columns cast to categoricals.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        input_df: DataFrame containing every column named in `cat_dict`.\n",
|
|
"        cat_dict: Mapping of column name -> category labels for that column.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        A new DataFrame in which each listed column has a CategoricalDtype built\n",
|
|
"        from its labels. Values outside the labels become NaN (pandas behaviour\n",
|
|
"        when casting to a fixed category set).\n",
|
|
"\n",
|
|
"    Raises:\n",
|
|
"        KeyError: If a column named in `cat_dict` is missing from `input_df`.\n",
|
|
"    \"\"\"\n",
|
|
"    from pandas.api.types import CategoricalDtype\n",
|
|
"\n",
|
|
"    # Work on a copy so the caller's frame (here a split returned by\n",
|
|
"    # train_test_split, or the request payload) is never mutated.\n",
|
|
"    output_df = input_df.copy()\n",
|
|
"    for col, cat in cat_dict.items():\n",
|
|
"        output_df[col] = output_df[col].astype(CategoricalDtype(categories=cat))\n",
|
|
"\n",
|
|
"    return output_df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 374
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7040,
|
|
"status": "ok",
|
|
"timestamp": 1574651007749,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "NIsFsowDWGfo",
|
|
"outputId": "7b8112e0-6299-407d-9530-a6801e304fcc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train_cat = apply_categories(X_train, column_categories)\n",
|
|
"X_train_cat.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Te17ukrSXD7F"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train_final = pd.get_dummies(X_train_cat, columns=cat_columns)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 88
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 8262,
|
|
"status": "ok",
|
|
"timestamp": 1574651008984,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "QkFwEdz-ficr",
|
|
"outputId": "e1b6b1e6-40af-4601-b3c1-f35e8d8eb5cd"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"rf_model = RandomForestClassifier(random_state=8)\n",
|
|
"rf_model.fit(X_train_final, y_train)\n",
|
|
"joblib.dump(rf_model, \"model.pkl\") "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "7E2YdRNef1eP"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import socket\n",
|
|
"import threading\n",
|
|
"import requests\n",
|
|
"import json\n",
|
|
"from flask import Flask, jsonify, request\n",
|
|
"import numpy as np"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "eo2FtaYSf1kw"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"app = Flask(__name__)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ketUKpP0f1nE"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# NOTE: joblib/pickle can execute arbitrary code while loading; only load\n",
|
|
"# artefacts that this notebook itself wrote.\n",
|
|
"trained_model = joblib.load(\"model.pkl\")\n",
|
|
"# Despite its name, `var_means` holds the column -> categories mapping.\n",
|
|
"with open(\"categories_data.pkl\", \"rb\") as categories_file:\n",
|
|
"    var_means = pickle.load(categories_file)\n",
|
|
"with open(\"categorical_columns.pkl\", \"rb\") as columns_file:\n",
|
|
"    cat_cols = pickle.load(columns_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "hYSIZLGxf1pb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"@app.route('/api', methods=['POST'])\n",
|
|
"def predict():\n",
|
|
"    \"\"\"Predict the class of a single record posted as a JSON object.\n",
|
|
"\n",
|
|
"    The body must be a JSON object mapping feature name -> value. Responds with\n",
|
|
"    the model's prediction rendered as a string, or with HTTP 400 and an\n",
|
|
"    `error` message when the body is missing, invalid, or not an object.\n",
|
|
"    \"\"\"\n",
|
|
"    # silent=True returns None instead of raising on a missing/invalid JSON body.\n",
|
|
"    data = request.get_json(silent=True)\n",
|
|
"    if not isinstance(data, dict) or not data:\n",
|
|
"        return jsonify(error='Request body must be a non-empty JSON object.'), 400\n",
|
|
"\n",
|
|
"    # index=[0] builds a one-row frame from the scalar values of the payload.\n",
|
|
"    df_test = pd.DataFrame(data, index=[0])\n",
|
|
"    df_test_clean = apply_categories(df_test, var_means)\n",
|
|
"    df_test_final = pd.get_dummies(df_test_clean, columns=cat_cols)\n",
|
|
"    prediction = trained_model.predict(df_test_final)\n",
|
|
"    str_pred = np.array2string(prediction)\n",
|
|
"    return jsonify(str_pred)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 8532,
|
|
"status": "ok",
|
|
"timestamp": 1574651009283,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "d-8fz7aFf1jm",
|
|
"outputId": "b7e13203-ef10-43ef-d1ce-8d961d7d8a5e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# daemon=True lets the interpreter exit even though app.run() never returns.\n",
|
|
"# NOTE(review): host 0.0.0.0 listens on every network interface and /api has no\n",
|
|
"# authentication; restrict the host if the machine is reachable by others.\n",
|
|
"flask_thread = threading.Thread(\n",
|
|
"    target=app.run,\n",
|
|
"    kwargs={'host': '0.0.0.0', 'port': 8080},\n",
|
|
"    daemon=True,\n",
|
|
")\n",
|
|
"flask_thread.start()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 139
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 8525,
|
|
"status": "ok",
|
|
"timestamp": 1574651009284,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "AF4eiCKugmZ1",
|
|
"outputId": "99f7ed83-836b-4e18-c270-6bd690c7efbc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"record = X_test.iloc[0,].to_json()\n",
|
|
"record"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "-GLgAVCTf1g3"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}\n",
|
|
"ip_address = socket.gethostbyname(socket.gethostname()) + ':8080'\n",
|
|
"\n",
|
|
"ip_address"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 8514,
|
|
"status": "ok",
|
|
"timestamp": 1574651009285,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "DR7wNIjkgmfa",
|
|
"outputId": "722f02c2-203e-47a3-c57f-0658159113f0"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# The timeout stops this cell from hanging forever if the server is unreachable.\n",
|
|
"r = requests.post(f\"http://{ip_address}/api\", data=record, headers=headers, timeout=10)\n",
|
|
"r.text"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Activity18_1.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|