mirror of
https://github.com/fenago/data-science.git
synced 2026-05-05 17:11:52 +00:00
274 lines
5.4 KiB
Plaintext
274 lines
5.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "d3tuu-nXbQI_"
|
|
},
|
|
"source": [
|
|
"# 5-Fold Cross Validation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "OZcXS3CMbQJA"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import libraries\n",
|
|
"import pandas as pd\n",
|
|
"from sklearn.model_selection import train_test_split"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 266
|
|
},
|
|
"colab_type": "code",
|
|
"id": "E_pUA2KBbQJJ",
|
|
"outputId": "4e4aaae8-77e3-4527-a456-2185519445a4"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# The raw file ships without a header row, so the column names are supplied here\n",
|
|
"_headers = [\n",
|
|
"    'buying', 'maint', 'doors', 'persons',\n",
|
|
"    'lug_boot', 'safety', 'car',\n",
|
|
"]\n",
|
|
"# Load the cars dataset, labelling the columns with _headers\n",
|
|
"df = pd.read_csv(\n",
|
|
"    '../Dataset/car.data',\n",
|
|
"    names=_headers,\n",
|
|
"    index_col=None,\n",
|
|
")\n",
|
|
"# Summarise the loaded frame\n",
|
|
"df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "l0vTXFTxbQJR"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable\n",
|
|
"training_df, eval_df = train_test_split(\n",
|
|
"    df,\n",
|
|
"    train_size=0.8,\n",
|
|
"    random_state=1,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 194
|
|
},
|
|
"colab_type": "code",
|
|
"id": "IoLYxyNIbQJX",
|
|
"outputId": "7fb4eed4-cba5-493d-c901-947895b41e8c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Preview the first five rows of the training split\n",
|
|
"training_df.head(n=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 194
|
|
},
|
|
"colab_type": "code",
|
|
"id": "IyoSMML6bQJd",
|
|
"outputId": "02d748bf-8d12-4352-dc57-6152b855b763"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Preview the first five rows of the evaluation split\n",
|
|
"eval_df.head(n=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "B5kBfeZZbQJk"
|
|
},
|
|
"source": [
|
|
"## KFold"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "KlKnqGdebQJl"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.model_selection import KFold"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "8Q0HLrEjbQJq"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Number of folds used for the cross validation below\n",
|
|
"n_splits = 5"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "XVr1kaHPbQJw"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Build the KFold splitter; shuffling stays off (the default), so folds are consecutive\n",
|
|
"_kf = KFold(n_splits=n_splits, shuffle=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "3T1UfHi5bQJz"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# split() returns a generator of (train, validation) index arrays, one pair per fold\n",
|
|
"_indices = _kf.split(X=df)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Bx52vDyMbQJ5"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Accumulators: one training DataFrame and one validation DataFrame per fold\n",
|
|
"_t = []\n",
|
|
"_v = []"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "WmyBGmL8bQJ8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Materialise one training and one validation DataFrame per fold.\n",
|
|
"# _indices is a one-shot generator, so loop over it directly instead of calling\n",
|
|
"# next() a fixed number of times (which raises StopIteration once it is exhausted).\n",
|
|
"for train_idx, val_idx in _indices:\n",
|
|
"    # KFold yields row positions, not index labels: select by position with\n",
|
|
"    # .iloc rather than df.drop(), which matches on labels.\n",
|
|
"    _t.append(df.iloc[train_idx])\n",
|
|
"    _v.append(df.iloc[val_idx])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 1000
|
|
},
|
|
"colab_type": "code",
|
|
"id": "zdhxalisbQKA",
|
|
"outputId": "0a58a9dd-0e3c-4d8a-9133-e9587c89a3f1"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# DataFrame.info() writes its report itself and returns None, so wrapping it in\n",
|
|
"# print() would only add a stray 'None' line after each training-fold summary.\n",
|
|
"for d in _t:\n",
|
|
"    d.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 1000
|
|
},
|
|
"colab_type": "code",
|
|
"id": "zlisAq67bQKE",
|
|
"outputId": "65aaac45-3721-46c0-eb88-6fc6f8662d64"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# DataFrame.info() writes its report itself and returns None, so wrapping it in\n",
|
|
"# print() would only add a stray 'None' line after each validation-fold summary.\n",
|
|
"for d in _v:\n",
|
|
"    d.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "iQK3oFtDbQKK"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Exercise7.04.ipynb",
|
|
"provenance": []
|
|
},
|
|
"file_extension": ".py",
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
},
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"npconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": 3
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|