{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#began at 1:40pm\n", "df = pd.read_csv('german_credit_data.csv', delimiter= ',')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1000 entries, 0 to 999\n", "Data columns (total 21 columns):\n", "Check_Account 1000 non-null object\n", "Duration 1000 non-null int64\n", "Credit_history 1000 non-null object\n", "Purpose 1000 non-null object\n", "Credit amount 1000 non-null int64\n", "Saving_account 1000 non-null object\n", "Employment 1000 non-null object\n", "Install_rate 1000 non-null int64\n", "Personal_status 1000 non-null object\n", "Other_debrotors 1000 non-null object\n", "Present_residence 1000 non-null int64\n", "Property 1000 non-null object\n", "Age 1000 non-null int64\n", "Installment_plant 1000 non-null object\n", "Housing 1000 non-null object\n", "Num_credits 1000 non-null int64\n", "Job 1000 non-null object\n", "Num_dependents 1000 non-null int64\n", "Telephone 1000 non-null object\n", "Foreign 1000 non-null object\n", "Result 1000 non-null int64\n", "dtypes: int64(8), object(13)\n", "memory usage: 164.2+ KB\n" ] } ], "source": [ "#non-null check\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1000, 21)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationCredit amountInstall_ratePresent_residenceAgeNum_creditsNum_dependentsResult
count1000.0000001000.0000001000.0000001000.0000001000.0000001000.0000001000.0000001000.000000
mean20.9030003271.2580002.9730002.84500035.5460001.4070001.1550001.300000
std12.0588142822.7368761.1187151.10371811.3754690.5776540.3620860.458487
min4.000000250.0000001.0000001.00000019.0000001.0000001.0000001.000000
25%12.0000001365.5000002.0000002.00000027.0000001.0000001.0000001.000000
50%18.0000002319.5000003.0000003.00000033.0000001.0000001.0000001.000000
75%24.0000003972.2500004.0000004.00000042.0000002.0000001.0000002.000000
max72.00000018424.0000004.0000004.00000075.0000004.0000002.0000002.000000
\n", "
" ], "text/plain": [ " Duration Credit amount Install_rate Present_residence \\\n", "count 1000.000000 1000.000000 1000.000000 1000.000000 \n", "mean 20.903000 3271.258000 2.973000 2.845000 \n", "std 12.058814 2822.736876 1.118715 1.103718 \n", "min 4.000000 250.000000 1.000000 1.000000 \n", "25% 12.000000 1365.500000 2.000000 2.000000 \n", "50% 18.000000 2319.500000 3.000000 3.000000 \n", "75% 24.000000 3972.250000 4.000000 4.000000 \n", "max 72.000000 18424.000000 4.000000 4.000000 \n", "\n", " Age Num_credits Num_dependents Result \n", "count 1000.000000 1000.000000 1000.000000 1000.000000 \n", "mean 35.546000 1.407000 1.155000 1.300000 \n", "std 11.375469 0.577654 0.362086 0.458487 \n", "min 19.000000 1.000000 1.000000 1.000000 \n", "25% 27.000000 1.000000 1.000000 1.000000 \n", "50% 33.000000 1.000000 1.000000 1.000000 \n", "75% 42.000000 2.000000 1.000000 2.000000 \n", "max 75.000000 4.000000 2.000000 2.000000 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#describe our numerical features\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 700\n", "2 300\n", "Name: Result, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Result.value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "#segment the data\n", "X, y = df.loc[:, df.columns != 'Result'], df.Result" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "#hold out a test set; random_state is fixed so the split (and every score computed from it) is the same on each Restart & Run All\n", "train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "#split into numerical columns and categorical columns\n", "num_columns = [col 
for col in train_X.columns\n", " if np.issubdtype(train_X.dtypes[col], np.number)]\n", "cat_columns = [col for col in train_X.columns\n", " if col not in num_columns]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Duration',\n", " 'Credit amount ',\n", " 'Install_rate',\n", " 'Present_residence',\n", " 'Age',\n", " 'Num_credits',\n", " 'Num_dependents']" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "num_columns" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Check_Account ',\n", " 'Credit_history',\n", " 'Purpose',\n", " 'Saving_account',\n", " 'Employment',\n", " 'Personal_status',\n", " 'Other_debrotors',\n", " 'Property',\n", " 'Installment_plant',\n", " 'Housing',\n", " 'Job',\n", " 'Telephone',\n", " 'Foreign']" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_columns" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "#model for arbitrary results\n", "import lale\n", "from sklearn.preprocessing import Normalizer as Norm\n", "from lale.lib.lale import NoOp\n", "from sklearn.preprocessing import OneHotEncoder as OneHot\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from xgboost import XGBClassifier as XGBoost\n", "from sklearn.ensemble import RandomForestClassifier as RFC\n", "from sklearn.svm import LinearSVC\n", "from sklearn.compose import ColumnTransformer\n", "from lale.operators import make_pipeline\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "cluster:column_transformer\n", "\n", "\n", "ColumnTransformer\n", "\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", 
"\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "\n", "norm->lr\n", "\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pipeline_trainable = ColumnTransformer(\n", " transformers=[\n", " ('num_tfm', Norm(norm='l1'), num_columns),\n", " ('cat_tfm', OneHot(), cat_columns)]) >> LR()\n", "pipeline_trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 400 ms, sys: 7.8 ms, total: 408 ms\n", "Wall time: 415 ms\n" ] } ], "source": [ "%%time\n", "pipeline_trained = pipeline_trainable.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 76.4%\n" ] } ], "source": [ "import sklearn.metrics\n", "predictions = pipeline_trained.predict(test_X)\n", "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, predictions):.1%}')" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "cluster:column_transformer\n", "\n", "\n", "ColumnTransformer\n", "\n", "\n", "\n", "\n", "cluster:choice\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "\n", "norm->lr\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", 
"\n", "\n", "\n", "rfc\n", "\n", "\n", "RFC\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#lets get better results \n", "pipeline_planned = make_pipeline(\n", " ColumnTransformer(transformers=[\n", " ('num_tfm', Norm | NoOp, num_columns),\n", " ('cat_tfm', OneHot, cat_columns)]), \n", " LR | LinearSVC(dual=False)| XGBoost | RFC)\n", "pipeline_planned.visualize()" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[14:12:56] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n", "Parameters: { colsample_bylevel, colsample_bynode, colsample_bytree, gamma, max_delta_step, max_depth, min_child_weight, subsample, tree_method } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. Or some parameters are not used but slip through this\n", " verification. Please open an issue if you find above cases.\n", "\n", "\n", "[14:12:57] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n", "Parameters: { colsample_bylevel, colsample_bynode, colsample_bytree, gamma, max_delta_step, max_depth, min_child_weight, subsample, tree_method } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. Or some parameters are not used but slip through this\n", " verification. Please open an issue if you find above cases.\n", "\n", "\n", "[14:12:57] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n", "Parameters: { colsample_bylevel, colsample_bynode, colsample_bytree, gamma, max_delta_step, max_depth, min_child_weight, subsample, tree_method } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. 
Or some parameters are not used but slip through this\n", " verification. Please open an issue if you find above cases.\n", "\n", "\n", "100%|██████████| 5/5 [00:47<00:00, 9.58s/trial, best loss: -0.76894708802658]\n" ] } ], "source": [ "from lale.lib.lale import Hyperopt\n", "pipe_t = pipeline_planned.auto_configure(train_X, train_y, Hyperopt, cv=3, max_evals=5, scoring = 'roc_auc')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#inspect the pipeline returned by auto_configure (pipe_t), not the hand-built baseline (pipeline_trained)\n", "pipe_t.visualize()\n", "pipe_t.pretty_print(ipython_display=True, show_imports=False)" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#score the Hyperopt-selected pipeline (pipe_t) on the held-out split; the baseline pipeline_trained was already scored in an earlier cell\n", "predictions = pipe_t.predict(test_X)\n", "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, predictions):.1%}')\n", "#NOTE(review): roc_auc_score is given hard class labels from predict(), so this figure is presumably not comparable with the cross-validated 'roc_auc' that Hyperopt optimised - confirm\n", "print(f'roc_auc {sklearn.metrics.roc_auc_score(test_y, predictions):.1%}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }