mirror of
https://github.com/fenago/data-science.git
synced 2026-05-04 08:31:59 +00:00
487 lines
11 KiB
Plaintext
487 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "_b0A-ElAnHj2"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import joblib\n",
|
|
"from sklearn.ensemble import RandomForestClassifier"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "xuyNn2VlnHm9"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_url = '../Dataset/breast-cancer-wisconsin.data'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "MSH55O2Qn3nn"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"col_names = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size',\n",
|
|
"'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "qHM7W8jTnHye"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_csv(file_url, header=None, names=col_names, na_values='?')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "ZOV3ihu0dnnG"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"y = df.pop('Class')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "hnADQ-gGfGk4"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.drop('Sample code number', axis=1, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7414,
|
|
"status": "ok",
|
|
"timestamp": 1574652486313,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "mtp9sRXbdnpx",
|
|
"outputId": "b52bbbc5-c6c2-41eb-df8b-75e01d4af5cf"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"training_rows = int(df.shape[0] * 0.7)\n",
|
|
"training_rows"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "5QiCAi94dnsE"
 },
 "outputs": [],
 "source": [
  "# Ordered (unshuffled) split: the first `training_rows` rows train, the remainder test.\n",
  "X_train, X_test = df.iloc[:training_rows], df.iloc[training_rows:]\n",
  "y_train, y_test = y.iloc[:training_rows], y.iloc[training_rows:]"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 187
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7406,
|
|
"status": "ok",
|
|
"timestamp": 1574652486314,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "APxUiwuLdnzH",
|
|
"outputId": "0748da40-ea30-495a-efea-90c8746f8dac"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train.isna().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 170
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7397,
|
|
"status": "ok",
|
|
"timestamp": 1574652486315,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "ChSTYAQHdn16",
|
|
"outputId": "1fc7d4ca-42eb-4a4b-f06c-af5b69cc996f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"num_columns = [col for col in X_train.columns if X_train[col].dtype != 'object']\n",
|
|
"num_columns"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 170
  },
  "colab_type": "code",
  "executionInfo": {
   "elapsed": 7393,
   "status": "ok",
   "timestamp": 1574652486317,
   "user": {
    "displayName": "Anthony So",
    "photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
    "userId": "11809607246124237079"
   },
   "user_tz": -660
  },
  "id": "C3rO4euNekF3",
  "outputId": "e0611fd0-85d2-4556-9dcf-ad87afbcd831"
 },
 "outputs": [],
 "source": [
  "# Map each numeric feature name to its mean over the training rows only.\n",
  "column_mean = {name: X_train[name].mean() for name in num_columns}\n",
  "column_mean"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "XNPAWdnNe_ug"
 },
 "outputs": [],
 "source": [
  "import pickle\n",
  "\n",
  "# Persist the per-column means to disk as columns_mean.pkl.\n",
  "# The context manager closes (and therefore flushes) the file deterministically\n",
  "# instead of relying on garbage collection of an anonymous file handle.\n",
  "with open(\"columns_mean.pkl\", \"wb\") as means_file:\n",
  "    pickle.dump(column_mean, means_file)"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {
   "base_uri": "https://localhost:8080/",
   "height": 102
  },
  "colab_type": "code",
  "executionInfo": {
   "elapsed": 7385,
   "status": "ok",
   "timestamp": 1574652486318,
   "user": {
    "displayName": "Anthony So",
    "photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
    "userId": "11809607246124237079"
   },
   "user_tz": -660
  },
  "id": "V1vXPXzie_rt",
  "outputId": "f8e0ee5c-87d2-4296-e222-b9cd2c597c7f"
 },
 "outputs": [],
 "source": [
  "# Impute missing values with the per-column means held in `column_mean`.\n",
  "# A DataFrame-level fillna with a {column: value} dict returns a new frame, so\n",
  "# X_train is rebound rather than mutated. The previous chained call\n",
  "# `X_train[col].fillna(..., inplace=True)` operated on a temporary Series taken\n",
  "# from a slice of `df`: it raises chained-assignment warnings and is a silent\n",
  "# no-op under pandas Copy-on-Write, which would leave the NaNs in place.\n",
  "X_train = X_train.fillna(value=column_mean)"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 88
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7381,
|
|
"status": "ok",
|
|
"timestamp": 1574652486319,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "QkFwEdz-ficr",
|
|
"outputId": "d9224d81-9712-4447-af82-c85023a05c1f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"rf_model = RandomForestClassifier(random_state=1)\n",
|
|
"rf_model.fit(X_train, y_train)\n",
|
|
"joblib.dump(rf_model, \"model.pkl\") "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "7E2YdRNef1eP"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import socket\n",
|
|
"import threading\n",
|
|
"import requests\n",
|
|
"import json\n",
|
|
"from flask import Flask, jsonify, request\n",
|
|
"import numpy as np"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "eo2FtaYSf1kw"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"app = Flask(__name__)"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "ketUKpP0f1nE"
 },
 "outputs": [],
 "source": [
  "# Reload the artifacts written earlier in this notebook.\n",
  "# NOTE: joblib/pickle loading can execute arbitrary code; only load files you trust.\n",
  "trained_model = joblib.load(\"model.pkl\")\n",
  "# Context manager guarantees the file handle is closed after reading.\n",
  "with open(\"columns_mean.pkl\", \"rb\") as means_file:\n",
  "    var_means = pickle.load(means_file)"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "colab": {},
  "colab_type": "code",
  "id": "hYSIZLGxf1pb"
 },
 "outputs": [],
 "source": [
  "@app.route('/api', methods=['POST'])\n",
  "def predict():\n",
  "    \"\"\"Score one JSON record posted to /api.\n",
  "\n",
  "    The request body is parsed as JSON and loaded into a one-row DataFrame\n",
  "    (assumes a flat JSON object of feature name -> value). Missing values are\n",
  "    replaced with the per-column values in `var_means`, then the row is passed\n",
  "    to `trained_model.predict`.\n",
  "\n",
  "    Returns:\n",
  "        A JSON response containing the prediction array rendered as a string.\n",
  "    \"\"\"\n",
  "    data = request.get_json()\n",
  "    df_test = pd.DataFrame(data, index=[0])\n",
  "    # Rebind the frame instead of calling `df_test[col].fillna(..., inplace=True)`:\n",
  "    # the chained in-place call acts on a temporary Series and is a no-op under\n",
  "    # pandas Copy-on-Write, which would send un-imputed NaNs to the model.\n",
  "    df_test = df_test.fillna(value=var_means)\n",
  "    prediction = trained_model.predict(df_test)\n",
  "    # array2string turns the NumPy result into plain text so it can be serialized.\n",
  "    str_pred = np.array2string(prediction)\n",
  "    return jsonify(str_pred)"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7703,
|
|
"status": "ok",
|
|
"timestamp": 1574652486658,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "d-8fz7aFf1jm",
|
|
"outputId": "3faa1f57-9d89-4146-f8ed-e3d259645a26"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"flask_thread = threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':8080})\n",
|
|
"flask_thread.start()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 139
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7699,
|
|
"status": "ok",
|
|
"timestamp": 1574652486660,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "AF4eiCKugmZ1",
|
|
"outputId": "eabdc4e6-82a0-45ae-a081-e6738a409794"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"record = X_test[X_test['Bare Nuclei'].isna()].iloc[0].to_json()\n",
|
|
"record"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "-GLgAVCTf1g3"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}\n",
|
|
"ip_address = socket.gethostbyname(socket.gethostname()) + ':8080' \n",
|
|
"\n",
|
|
"ip_address"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 51
|
|
},
|
|
"colab_type": "code",
|
|
"executionInfo": {
|
|
"elapsed": 7692,
|
|
"status": "ok",
|
|
"timestamp": 1574652486661,
|
|
"user": {
|
|
"displayName": "Anthony So",
|
|
"photoUrl": "https://lh3.googleusercontent.com/a-/AAuE7mCYY-iGjUIqBSnlLoszfZTN7rU7FRNg05Rdt9Ii3A=s64",
|
|
"userId": "11809607246124237079"
|
|
},
|
|
"user_tz": -660
|
|
},
|
|
"id": "DR7wNIjkgmfa",
|
|
"outputId": "7bb6a95a-9bd7-48f9-fcaf-0fb82dffbd56"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"r = requests.post(f\"http://{ip_address}/api\", data=record, headers=headers)\n",
|
|
"r.text"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "Exercise18_03.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|