mirror of
https://github.com/fenago/data-science.git
synced 2026-05-06 01:22:41 +00:00
281 lines
5.4 KiB
Plaintext
281 lines
5.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "1Jea7edSZEQt"
|
|
},
|
|
"source": [
|
|
"# Cross Validation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "LlEvES3LZEQu"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import libraries\n",
|
|
"import pandas as pd\n",
|
|
"from sklearn.model_selection import train_test_split"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 266
|
|
},
|
|
"colab_type": "code",
|
|
"id": "mKjSA1nyZEQ1",
|
|
"outputId": "ceea8307-5740-4402-ac68-ae4982ea0897"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# data doesn't have headers, so let's create headers\n",
|
|
"_headers = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'car']\n",
|
|
"# read in cars dataset\n",
|
|
"df = pd.read_csv('../Dataset/car.data', names=_headers, index_col=None)\n",
|
|
"df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Q7bSbwgLZEQ6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# split the data into 80% for training and 20% for evaluation\n",
|
|
"training_df, eval_df = train_test_split(df, train_size=0.8, random_state=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 194
|
|
},
|
|
"colab_type": "code",
|
|
"id": "15q798w9ZERA",
|
|
"outputId": "87ba948e-8696-4bab-d16d-c77830d51595"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"training_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 194
|
|
},
|
|
"colab_type": "code",
|
|
"id": "1m6MMB5qZERF",
|
|
"outputId": "5bd155a7-a1d2-4508-c060-3236542c6d4e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"eval_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"colab_type": "text",
|
|
"id": "JNDU3YyqZERK"
|
|
},
|
|
"source": [
|
|
"## KFold"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "kcfasL9TZERL"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.model_selection import KFold"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "aZ0kHogUZERO"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"_kf = KFold(n_splits=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "xRNYRhVaZERU"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"indices = _kf.split(df)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 35
|
|
},
|
|
"colab_type": "code",
|
|
"id": "FWwkr65qZERX",
|
|
"outputId": "6a3b8cde-0f08-43c7-d2e3-87ec31cea395"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(type(indices))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "QWCNPEzSZERa"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# take the first (train, validation) pair of index arrays from the KFold split\n",
|
|
"train_indices, val_indices = next(indices)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "O_IOUW9aZERd"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_df = df.drop(val_indices)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 266
|
|
},
|
|
"colab_type": "code",
|
|
"id": "0wNV-QYwZERg",
|
|
"outputId": "a482777b-4b99-4bba-9211-cb9093953944"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "LDQVoPAHZERl"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"val_df = df.drop(train_indices)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 266
|
|
},
|
|
"colab_type": "code",
|
|
"id": "X8UvqR28ZERo",
|
|
"outputId": "c6d9e3c6-3936-49d8-bc67-a7d5aeab44f6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"val_df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"colab": {},
|
|
"colab_type": "code",
|
|
"id": "Jn14g6h0ZERr"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Exercise7.03.ipynb",
|
|
"provenance": []
|
|
},
|
|
"file_extension": ".py",
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
},
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"npconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": 3
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|