{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the German credit dataset. The CSV header carries trailing whitespace\n",
    "# on some labels (e.g. 'Credit amount '), so strip every column name up front\n",
    "# to keep later label-based access predictable.\n",
    "df = pd.read_csv('german_credit_data.csv').rename(columns=str.strip)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 21 columns):\n",
      "Check_Account        1000 non-null object\n",
      "Duration             1000 non-null int64\n",
      "Credit_history       1000 non-null object\n",
      "Purpose              1000 non-null object\n",
      "Credit amount        1000 non-null int64\n",
      "Saving_account       1000 non-null object\n",
      "Employment           1000 non-null object\n",
      "Install_rate         1000 non-null int64\n",
      "Personal_status      1000 non-null object\n",
      "Other_debrotors      1000 non-null object\n",
      "Present_residence    1000 non-null int64\n",
      "Property             1000 non-null object\n",
      "Age                  1000 non-null int64\n",
      "Installment_plant    1000 non-null object\n",
      "Housing              1000 non-null object\n",
      "Num_credits          1000 non-null int64\n",
      "Job                  1000 non-null object\n",
      "Num_dependents       1000 non-null int64\n",
      "Telephone            1000 non-null object\n",
      "Foreign              1000 non-null object\n",
      "Result               1000 non-null int64\n",
      "dtypes: int64(8), object(13)\n",
      "memory usage: 164.2+ KB\n"
     ]
    }
   ],
   "source": [
    "#non-null check\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 21)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Duration</th>\n",
       "      <th>Credit amount</th>\n",
       "      <th>Install_rate</th>\n",
       "      <th>Present_residence</th>\n",
       "      <th>Age</th>\n",
       "      <th>Num_credits</th>\n",
       "      <th>Num_dependents</th>\n",
       "      <th>Result</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>20.903000</td>\n",
       "      <td>3271.258000</td>\n",
       "      <td>2.973000</td>\n",
       "      <td>2.845000</td>\n",
       "      <td>35.546000</td>\n",
       "      <td>1.407000</td>\n",
       "      <td>1.155000</td>\n",
       "      <td>1.300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>12.058814</td>\n",
       "      <td>2822.736876</td>\n",
       "      <td>1.118715</td>\n",
       "      <td>1.103718</td>\n",
       "      <td>11.375469</td>\n",
       "      <td>0.577654</td>\n",
       "      <td>0.362086</td>\n",
       "      <td>0.458487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>4.000000</td>\n",
       "      <td>250.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>19.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>12.000000</td>\n",
       "      <td>1365.500000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>27.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>18.000000</td>\n",
       "      <td>2319.500000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>33.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>24.000000</td>\n",
       "      <td>3972.250000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>42.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>72.000000</td>\n",
       "      <td>18424.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>75.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Duration  Credit amount   Install_rate  Present_residence  \\\n",
       "count  1000.000000     1000.000000   1000.000000        1000.000000   \n",
       "mean     20.903000     3271.258000      2.973000           2.845000   \n",
       "std      12.058814     2822.736876      1.118715           1.103718   \n",
       "min       4.000000      250.000000      1.000000           1.000000   \n",
       "25%      12.000000     1365.500000      2.000000           2.000000   \n",
       "50%      18.000000     2319.500000      3.000000           3.000000   \n",
       "75%      24.000000     3972.250000      4.000000           4.000000   \n",
       "max      72.000000    18424.000000      4.000000           4.000000   \n",
       "\n",
       "               Age  Num_credits  Num_dependents       Result  \n",
       "count  1000.000000  1000.000000     1000.000000  1000.000000  \n",
       "mean     35.546000     1.407000        1.155000     1.300000  \n",
       "std      11.375469     0.577654        0.362086     0.458487  \n",
       "min      19.000000     1.000000        1.000000     1.000000  \n",
       "25%      27.000000     1.000000        1.000000     1.000000  \n",
       "50%      33.000000     1.000000        1.000000     1.000000  \n",
       "75%      42.000000     2.000000        1.000000     2.000000  \n",
       "max      75.000000     4.000000        2.000000     2.000000  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# describe the numerical features\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    700\n",
       "2    300\n",
       "Name: Result, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Result.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the target column 'Result' from the predictor columns.\n",
    "y = df['Result']\n",
    "X = df.drop(columns='Result')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix the seed so the split (and every metric below) is reproducible on a\n",
    "# fresh kernel, and stratify to keep the 700/300 class ratio in both parts.\n",
    "train_X, test_X, train_y, test_y = train_test_split(\n",
    "    X, y, random_state=42, stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Partition the feature columns by dtype: numeric ones go to num_columns,\n",
    "# every remaining (non-numeric) column goes to cat_columns.\n",
    "num_columns = [name for name, kind in train_X.dtypes.items()\n",
    "               if np.issubdtype(kind, np.number)]\n",
    "cat_columns = [name for name, kind in train_X.dtypes.items()\n",
    "               if not np.issubdtype(kind, np.number)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Duration',\n",
       " 'Credit amount ',\n",
       " 'Install_rate',\n",
       " 'Present_residence',\n",
       " 'Age',\n",
       " 'Num_credits',\n",
       " 'Num_dependents']"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Check_Account ',\n",
       " 'Credit_history',\n",
       " 'Purpose',\n",
       " 'Saving_account',\n",
       " 'Employment',\n",
       " 'Personal_status',\n",
       " 'Other_debrotors',\n",
       " 'Property',\n",
       " 'Installment_plant',\n",
       " 'Housing',\n",
       " 'Job',\n",
       " 'Telephone',\n",
       " 'Foreign']"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model for arbitrary results\n",
    "import lale\n",
    "from sklearn.preprocessing import Normalizer as Norm\n",
    "from lale.lib.lale import NoOp\n",
    "from sklearn.preprocessing import OneHotEncoder as OneHot\n",
    "from sklearn.linear_model import LogisticRegression as LR\n",
    "from xgboost import XGBClassifier as XGBoost\n",
    "from sklearn.ensemble import RandomForestClassifier as RFC\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from lale.operators import make_pipeline\n",
    "lale.wrap_imported_operators()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.43.0 (0)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"220pt\" height=\"141pt\"\n",
       " viewBox=\"0.00 0.00 220.00 141.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 137)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-137 216,-137 216,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<g id=\"clust1\" class=\"cluster\">\n",
       "<title>cluster:column_transformer</title>\n",
       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.column_transformer.html\" xlink:title=\"column_transformer = ColumnTransformer(transformers=[(&#39;num_tfm&#39;, norm, [&#39;Duration&#39;, &#39;Credit amount &#39;, &#39;Install_rate&#39;, &#39;Present_residence&#39;, &#39;Age&#39;, &#39;Num_credits&#39;, &#39;Num_dependents&#39;]), (&#39;cat_tfm&#39;, one_hot, [&#39;Check_Account &#39;, &#39;Credit_history&#39;, &#39;Purpose&#39;, &#39;Saving_account&#39;, &#39;Employment&#39;, &#39;Personal...)\">\n",
       "<polygon fill=\"#b0e2ff\" stroke=\"black\" points=\"0,-8 0,-125 130,-125 130,-8 0,-8\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-109.8\" font-family=\"Times,serif\" font-size=\"14.00\">ColumnTransformer</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>norm</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.normalizer.html\" xlink:title=\"norm = Norm(norm=&#39;l1&#39;)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"65\" cy=\"-77\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-73.7\" font-family=\"Times,serif\" font-size=\"11.00\">Norm</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- lr -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>lr</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html\" xlink:title=\"lr = LR()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"185\" cy=\"-77\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"185\" y=\"-73.7\" font-family=\"Times,serif\" font-size=\"11.00\">LR</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm&#45;&gt;lr -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>norm&#45;&gt;lr</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M130,-77C136.08,-77 142.08,-77 147.76,-77\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"147.89,-80.5 157.89,-77 147.89,-73.5 147.89,-80.5\"/>\n",
       "</g>\n",
       "<!-- one_hot -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>one_hot</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.one_hot_encoder.html\" xlink:title=\"one_hot = OneHot()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"65\" cy=\"-34\" rx=\"27\" ry=\"18.27\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">One&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-25.2\" font-family=\"Times,serif\" font-size=\"11.00\">Hot</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fea7519eb90>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pipeline_trainable = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num_tfm', Norm(norm='l1'), num_columns),\n",
    "        ('cat_tfm', OneHot(), cat_columns)]) >> LR()\n",
    "pipeline_trainable.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 400 ms, sys: 7.8 ms, total: 408 ms\n",
      "Wall time: 415 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "pipeline_trained = pipeline_trainable.fit(train_X, train_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 76.4%\n"
     ]
    }
   ],
   "source": [
    "import sklearn.metrics\n",
    "predictions = pipeline_trained.predict(test_X)\n",
    "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, predictions):.1%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.43.0 (0)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"231pt\" height=\"258pt\"\n",
       " viewBox=\"0.00 0.00 230.78 258.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 254)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-254 226.78,-254 226.78,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<g id=\"clust3\" class=\"cluster\">\n",
       "<title>cluster:choice_0</title>\n",
       "<g id=\"a_clust3\"><a xlink:title=\"choice_0 = lr | linear_svc | xg_boost | rfc\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"142,-8 142,-212 214.78,-212 214.78,-8 142,-8\"/>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-196.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust1\" class=\"cluster\">\n",
       "<title>cluster:column_transformer</title>\n",
       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.column_transformer.html\" xlink:title=\"column_transformer = ColumnTransformer(transformers=[(&#39;num_tfm&#39;, choice, [&#39;Duration&#39;, &#39;Credit amount &#39;, &#39;Install_rate&#39;, &#39;Present_residence&#39;, &#39;Age&#39;, &#39;Num_credits&#39;, &#39;Num_dependents&#39;]), (&#39;cat_tfm&#39;, one_hot, [&#39;Check_Account &#39;, &#39;Credit_history&#39;, &#39;Purpose&#39;, &#39;Saving_account&#39;, &#39;Employment&#39;, &#39;Person...)\">\n",
       "<polygon fill=\"#b0e2ff\" stroke=\"black\" points=\"0,-43 0,-242 130,-242 130,-43 0,-43\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-226.8\" font-family=\"Times,serif\" font-size=\"14.00\">ColumnTransformer</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust2\" class=\"cluster\">\n",
       "<title>cluster:choice</title>\n",
       "<g id=\"a_clust2\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.column_transformer.html\" xlink:title=\"choice = norm | no_op\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"30,-95 30,-212 100,-212 100,-95 30,-95\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-196.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>norm</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.normalizer.html\" xlink:title=\"norm = Norm\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"65\" cy=\"-164\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-160.7\" font-family=\"Times,serif\" font-size=\"11.00\">Norm</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- lr -->\n",
       "<g id=\"node4\" class=\"node\">\n",
       "<title>lr</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html\" xlink:title=\"lr = LR\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"178.39\" cy=\"-164\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-160.7\" font-family=\"Times,serif\" font-size=\"11.00\">LR</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm&#45;&gt;lr -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>norm&#45;&gt;lr</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M130,-164C133.73,-164 137.42,-164 140.98,-164\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"132,-167.5 142,-164 132,-160.5 132,-167.5\"/>\n",
       "</g>\n",
       "<!-- no_op -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>no_op</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html\" xlink:title=\"no_op = NoOp\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"65\" cy=\"-121\" rx=\"27\" ry=\"18.27\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-123.2\" font-family=\"Times,serif\" font-size=\"11.00\">No&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-112.2\" font-family=\"Times,serif\" font-size=\"11.00\">Op</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- one_hot -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>one_hot</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.one_hot_encoder.html\" xlink:title=\"one_hot = OneHot\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"65\" cy=\"-69\" rx=\"27\" ry=\"18.27\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-71.2\" font-family=\"Times,serif\" font-size=\"11.00\">One&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-60.2\" font-family=\"Times,serif\" font-size=\"11.00\">Hot</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- linear_svc -->\n",
       "<g id=\"node5\" class=\"node\">\n",
       "<title>linear_svc</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.linear_svc.html\" xlink:title=\"linear_svc = LinearSVC(dual=False)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"178.39\" cy=\"-121\" rx=\"28.28\" ry=\"18.27\"/>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-123.2\" font-family=\"Times,serif\" font-size=\"11.00\">Linear&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-112.2\" font-family=\"Times,serif\" font-size=\"11.00\">SVC</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- xg_boost -->\n",
       "<g id=\"node6\" class=\"node\">\n",
       "<title>xg_boost</title>\n",
       "<g id=\"a_node6\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.xgboost.XGBClassifier.html\" xlink:title=\"xg_boost = XGBoost\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"178.39\" cy=\"-77\" rx=\"27\" ry=\"18.27\"/>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-79.2\" font-family=\"Times,serif\" font-size=\"11.00\">XG&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-68.2\" font-family=\"Times,serif\" font-size=\"11.00\">Boost</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- rfc -->\n",
       "<g id=\"node7\" class=\"node\">\n",
       "<title>rfc</title>\n",
       "<g id=\"a_node7\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.random_forest_classifier.html\" xlink:title=\"rfc = RFC\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"178.39\" cy=\"-34\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"178.39\" y=\"-30.7\" font-family=\"Times,serif\" font-size=\"11.00\">RFC</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fea78befe10>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# let's get better results\n",
    "pipeline_planned = make_pipeline(\n",
    "    ColumnTransformer(transformers=[\n",
    "        ('num_tfm', Norm | NoOp, num_columns),\n",
    "        ('cat_tfm', OneHot, cat_columns)]), \n",
    "    LR | LinearSVC(dual=False)| XGBoost | RFC)\n",
    "pipeline_planned.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[14:12:56] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480:      \n",
      "Parameters: { colsample_bylevel, colsample_bynode, colsample_bytree, gamma, max_delta_step, max_depth, min_child_weight, subsample, tree_method } might not be used.\n",
      "\n",
      "  This may not be accurate due to some parameters are only used in language bindings but\n",
      "  passed down to XGBoost core.  Or some parameters are not used but slip through this\n",
      "  verification. Please open an issue if you find above cases.\n",
      "\n",
      "\n",
      "[14:12:57] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480:      \n",
      "Parameters: { colsample_bylevel, colsample_bynode, colsample_bytree, gamma, max_delta_step, max_depth, min_child_weight, subsample, tree_method } might not be used.\n",
      "\n",
      "  This may not be accurate due to some parameters are only used in language bindings but\n",
      "  passed down to XGBoost core.  Or some parameters are not used but slip through this\n",
      "  verification. Please open an issue if you find above cases.\n",
      "\n",
      "\n",
      "[14:12:57] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480:      \n",
      "Parameters: { colsample_bylevel, colsample_bynode, colsample_bytree, gamma, max_delta_step, max_depth, min_child_weight, subsample, tree_method } might not be used.\n",
      "\n",
      "  This may not be accurate due to some parameters are only used in language bindings but\n",
      "  passed down to XGBoost core.  Or some parameters are not used but slip through this\n",
      "  verification. Please open an issue if you find above cases.\n",
      "\n",
      "\n",
      "100%|██████████| 5/5 [00:47<00:00,  9.58s/trial, best loss: -0.76894708802658]\n"
     ]
    }
   ],
   "source": [
    "from lale.lib.lale import Hyperopt\n",
    "pipe_t = pipeline_planned.auto_configure(train_X, train_y, Hyperopt, cv=3, max_evals=5, scoring = 'roc_auc')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.43.0 (0)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"220pt\" height=\"141pt\"\n",
       " viewBox=\"0.00 0.00 220.00 141.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 137)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-137 216,-137 216,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<g id=\"clust1\" class=\"cluster\">\n",
       "<title>cluster:column_transformer</title>\n",
       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.column_transformer.html\" xlink:title=\"column_transformer = ColumnTransformer(transformers=[(&#39;num_tfm&#39;, norm, [&#39;Duration&#39;, &#39;Credit amount &#39;, &#39;Install_rate&#39;, &#39;Present_residence&#39;, &#39;Age&#39;, &#39;Num_credits&#39;, &#39;Num_dependents&#39;]), (&#39;cat_tfm&#39;, one_hot, [&#39;Check_Account &#39;, &#39;Credit_history&#39;, &#39;Purpose&#39;, &#39;Saving_account&#39;, &#39;Employment&#39;, &#39;Personal...)\">\n",
       "<polygon fill=\"white\" stroke=\"black\" points=\"0,-8 0,-125 130,-125 130,-8 0,-8\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-109.8\" font-family=\"Times,serif\" font-size=\"14.00\">ColumnTransformer</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>norm</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.normalizer.html\" xlink:title=\"norm = Norm(norm=&#39;l1&#39;)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"65\" cy=\"-77\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-73.7\" font-family=\"Times,serif\" font-size=\"11.00\">Norm</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- lr -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>lr</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html\" xlink:title=\"lr = LR()\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"185\" cy=\"-77\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"185\" y=\"-73.7\" font-family=\"Times,serif\" font-size=\"11.00\">LR</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm&#45;&gt;lr -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>norm&#45;&gt;lr</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M130,-77C136.08,-77 142.08,-77 147.76,-77\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"147.89,-80.5 157.89,-77 147.89,-73.5 147.89,-80.5\"/>\n",
       "</g>\n",
       "<!-- one_hot -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>one_hot</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.one_hot_encoder.html\" xlink:title=\"one_hot = OneHot()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"65\" cy=\"-34\" rx=\"27\" ry=\"18.27\"/>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">One&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"65\" y=\"-25.2\" font-family=\"Times,serif\" font-size=\"11.00\">Hot</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fea75e41190>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "```python\n",
       "norm = Norm(norm=\"l1\")\n",
       "column_transformer = ColumnTransformer(\n",
       "    transformers=[\n",
       "        (\n",
       "            \"num_tfm\",\n",
       "            norm,\n",
       "            [\n",
       "                \"Duration\",\n",
       "                \"Credit amount \",\n",
       "                \"Install_rate\",\n",
       "                \"Present_residence\",\n",
       "                \"Age\",\n",
       "                \"Num_credits\",\n",
       "                \"Num_dependents\",\n",
       "            ],\n",
       "        ),\n",
       "        (\n",
       "            \"cat_tfm\",\n",
       "            OneHot(),\n",
       "            [\n",
       "                \"Check_Account \",\n",
       "                \"Credit_history\",\n",
       "                \"Purpose\",\n",
       "                \"Saving_account\",\n",
       "                \"Employment\",\n",
       "                \"Personal_status\",\n",
       "                \"Other_debrotors\",\n",
       "                \"Property\",\n",
       "                \"Installment_plant\",\n",
       "                \"Housing\",\n",
       "                \"Job\",\n",
       "                \"Telephone\",\n",
       "                \"Foreign\",\n",
       "            ],\n",
       "        ),\n",
       "    ]\n",
       ")\n",
       "pipeline = column_transformer >> LR()\n",
       "```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Inspect the pipeline selected by the Hyperopt search (pipe_t), not the\n",
    "# hand-built baseline pipeline_trained that was already shown earlier.\n",
    "pipe_t.visualize()\n",
    "pipe_t.pretty_print(ipython_display=True, show_imports=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 76.4%\n",
      "roc_auc 68.0%\n"
     ]
    }
   ],
   "source": [
    "# Score the Hyperopt-tuned pipeline (pipe_t) on the held-out split so it can\n",
    "# be compared against the baseline accuracy printed earlier.\n",
    "predictions = pipe_t.predict(test_X)\n",
    "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, predictions):.1%}')\n",
    "# NOTE: this ROC AUC is computed from hard class labels rather than scores\n",
    "# or probabilities, so it is only a coarse measure of ranking quality.\n",
    "print(f'roc_auc {sklearn.metrics.roc_auc_score(test_y, predictions):.1%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}