From 37121506ae804baf43f173ec6f6bd722da14867c Mon Sep 17 00:00:00 2001
From: WenqinGan <wxg39@psu.edu>
Date: Fri, 27 Jan 2023 06:47:35 +0000
Subject: [PATCH] finished basic visualization

---
 flaml/automl/automl.py   |  47 ++++++++++++++
 notebook/visualize.ipynb | 136 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 notebook/visualize.ipynb
diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
index 1b6bef199b..e28c600f97 100644
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -3779,6 +3779,53 @@ def _search(self):
                 else:
                     logger.info("not retraining because the time budget is too small.")
 
+    def visualize(self):
+        """Show an interactive dashboard widget for a trained AutoML instance.
+
+        Must be called after fit(...), in a Jupyter front end where ipywidgets
+        can render. A dropdown switches between a feature-importance bar chart
+        and a learning curve read from the log file named in the settings.
+        """
+        import matplotlib.pyplot as plt
+        from ipywidgets import interact
+
+        # `interact` builds the dropdown, displays it and re-runs the callback
+        # on every selection change, so no explicit call is needed afterwards.
+        @interact
+        def helper(option=["Feature Importance", "Learning Curve"]):
+            if option == "Feature Importance":
+                plt.barh(self.feature_names_in_, self.feature_importances_)
+                plt.xlabel("Feature Importance")
+                plt.ylabel("Feature")
+                plt.show()
+            elif option == "Learning Curve":
+                from flaml.data import get_output_from_log
+
+                log_file_name = self._settings.get("log_file_name")
+                if not log_file_name:
+                    logger.warning(
+                        "Log file for this instance not found. Unable to visualize learning curve."
+                    )
+                    return
+                # No time cutoff: a fixed 240s budget dropped later trials. NOTE(review):
+                # assumes time_budget is only an upper bound on wall-clock time - confirm.
+                (
+                    time_history,
+                    best_valid_loss_history,
+                    valid_loss_history,
+                    _config_history,
+                    _metric_history,
+                ) = get_output_from_log(filename=log_file_name, time_budget=float("inf"))
+                # NOTE(review): "1 - loss" assumes an accuracy-style metric - confirm.
+                plt.title("Learning Curve")
+                plt.xlabel("Wall Clock Time (s)")
+                plt.ylabel("Validation Accuracy")
+                plt.scatter(time_history, 1 - np.array(valid_loss_history))
+                plt.step(
+                    time_history, 1 - np.array(best_valid_loss_history), where="post"
+                )
+                plt.show()
+
     def __del__(self):
         if (
             hasattr(self, "_trained_estimator")
diff --git a/notebook/visualize.ipynb b/notebook/visualize.ipynb
new file mode 100644
index 0000000000..d60275cb09
--- /dev/null
+++ b/notebook/visualize.ipynb
@@ -0,0 +1,136 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<jemalloc>: MADV_DONTNEED does not work (memset will be used instead)\n",
+      "<jemalloc>: (This is the expected behaviour if you are running under QEMU)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Enabling notebook extension jupyter-js-widgets/extension...\n",
+      "      - Validating: \u001b[32mOK\u001b[0m\n",
+      "load dataset from ./openml_ds1169.pkl\n",
+      "Dataset name: airlines\n",
+      "X_train.shape: (404537, 7), y_train.shape: (404537,);\n",
+      "X_test.shape: (134846, 7), y_test.shape: (134846,)\n",
+      "[flaml.automl.automl: 01-27 05:48:20] {2715} INFO - task = classification\n",
+      "[flaml.automl.automl: 01-27 05:48:20] {2717} INFO - Data split method: stratified\n",
+      "[flaml.automl.automl: 01-27 05:48:20] {2720} INFO - Evaluation method: holdout\n",
+      "[flaml.automl.automl: 01-27 05:48:22] {2847} INFO - Minimizing error metric: 1-accuracy\n",
+      "[flaml.automl.automl: 01-27 05:48:22] {2993} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']\n",
+      "[flaml.automl.automl: 01-27 05:48:22] {3322} INFO - iteration 0, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:23] {3461} INFO - Estimated sufficient time budget=275476s. Estimated necessary time budget=6763s.\n",
+      "[flaml.automl.automl: 01-27 05:48:23] {3513} INFO -  at 4.4s,\testimator lgbm's best error=0.4459,\tbest estimator lgbm's best error=0.4459\n",
+      "[flaml.automl.automl: 01-27 05:48:23] {3322} INFO - iteration 1, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:23] {3513} INFO -  at 4.9s,\testimator lgbm's best error=0.4459,\tbest estimator lgbm's best error=0.4459\n",
+      "[flaml.automl.automl: 01-27 05:48:23] {3322} INFO - iteration 2, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:24] {3513} INFO -  at 5.4s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:24] {3322} INFO - iteration 3, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:24] {3513} INFO -  at 5.8s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:24] {3322} INFO - iteration 4, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:24] {3513} INFO -  at 6.0s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:24] {3322} INFO - iteration 5, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:25] {3513} INFO -  at 6.4s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:25] {3322} INFO - iteration 6, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:25] {3513} INFO -  at 6.9s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:25] {3322} INFO - iteration 7, current learner xgboost\n",
+      "[flaml.automl.automl: 01-27 05:48:26] {3513} INFO -  at 8.0s,\testimator xgboost's best error=0.3787,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:26] {3322} INFO - iteration 8, current learner lgbm\n",
+      "[flaml.automl.automl: 01-27 05:48:27] {3513} INFO -  at 8.6s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:27] {3322} INFO - iteration 9, current learner xgboost\n",
+      "[flaml.automl.automl: 01-27 05:48:28] {3513} INFO -  at 9.6s,\testimator xgboost's best error=0.3787,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:28] {3322} INFO - iteration 10, current learner extra_tree\n",
+      "[flaml.automl.automl: 01-27 05:48:28] {3513} INFO -  at 9.9s,\testimator extra_tree's best error=0.4459,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:28] {3322} INFO - iteration 11, current learner rf\n",
+      "[flaml.automl.automl: 01-27 05:48:28] {3513} INFO -  at 10.2s,\testimator rf's best error=0.4421,\tbest estimator lgbm's best error=0.3777\n",
+      "[flaml.automl.automl: 01-27 05:48:31] {3773} INFO - retrain lgbm for 2.3s\n",
+      "[flaml.automl.automl: 01-27 05:48:31] {3778} INFO - retrained model: LGBMClassifier(learning_rate=0.26770501231052046, max_bin=127,\n",
+      "               min_child_samples=12, n_estimators=4, num_leaves=4,\n",
+      "               reg_alpha=0.001348364934537134, reg_lambda=1.4442580148221913,\n",
+      "               verbose=-1)\n",
+      "[flaml.automl.automl: 01-27 05:48:31] {3023} INFO - fit succeeded\n",
+      "[flaml.automl.automl: 01-27 05:48:31] {3025} INFO - Time taken to find the best model: 5.436654806137085\n"
+     ]
+    }
+   ],
+   "source": [
+    "from flaml.automl.automl import AutoML\n",
+    "from flaml.data import load_openml_dataset\n",
+    "!jupyter nbextension enable --py widgetsnbextension\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir=\"./\")\n",
+    "settings = {\n",
+    "    \"time_budget\": 10,  # total running time in seconds\n",
+    "    \"metric\": \"accuracy\",  # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\n",
+    "    # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\n",
+    "    \"task\": \"classification\",  # task type\n",
+    "    \"log_file_name\": \"airlines_experiment.log\",  # flaml log file\n",
+    "    \"seed\": 7654321,  # random seed\n",
+    "}\n",
+    "automl = AutoML(**settings)\n",
+    "automl.fit(X_train=X_train, y_train=y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a652c19b24404f6cb9d3f2938b033ec8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "interactive(children=(Dropdown(description='option', options=('Feature Importance', 'Learning Curve'), value='…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "automl.visualize()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}