Add test for s2c and loo
Phuc Nguyen committed Jun 15, 2023
1 parent 45a9394 commit 3504a20
Showing 4 changed files with 174 additions and 38 deletions.
1 change: 0 additions & 1 deletion cellbox/cellbox/dataset.py
@@ -15,7 +15,6 @@
def factory(cfg):
    """formulate training dataset"""
    # Prepare data
-    print("Hello!")
    if cfg.sparse_data:
        cfg.pert_in = tf.compat.v1.sparse.placeholder(tf.float32, [None, cfg.n_x], name='pert_in')
        cfg.expr_out = tf.compat.v1.sparse.placeholder(tf.float32, [None, cfg.n_x], name='expr_out')
115 changes: 91 additions & 24 deletions notebooks/dataloader.ipynb
@@ -10,9 +10,52 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-06-14 23:54:04.431174: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-06-14 23:54:04.628344: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2023-06-14 23:54:04.634777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n",
"2023-06-14 23:54:04.634820: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n",
"2023-06-14 23:54:09.211164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n",
"2023-06-14 23:54:09.212455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n",
"2023-06-14 23:54:09.212486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /users/ngun7t/anaconda3/envs/cellbox-3.6-2/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"non-resource variables are not supported in the long term\n",
"================================================================================\n",
" _____ _ _ ____ \n",
" / ____| | | | _ \\ \n",
" | | ___| | | |_) | _____ __ \n",
" | | / _ \\ | | _ < / _ \\ \\/ / \n",
" | |___| __/ | | |_) | (_) > < \n",
" \\_____\\___|_|_|____/ \\___/_/\\_\\ \n",
"Running CellBox scripts developed in Sander lab\n",
"Maintained by Bo Yuan, Judy Shen, and Augustin Luna; contributions by Daniel Ritter\n",
"\n",
" version 0.3.2\n",
" -- Feb 10, 2023 --\n",
" * Modify CellBox to support TF2 \n",
" \n",
"Tutorials and documentations are available at https://github.com/sanderlab/CellBox\n",
"If you want to discuss the usage or to report a bug, please use the 'Issues' function at GitHub.\n",
"If you find CellBox useful for your research, please consider citing the corresponding publication.\n",
"For more information, please email us at [email protected] and [email protected], [email protected]\n",
" --------------------------------------------------------------------------------\n"
]
}
],
"source": [
"import cellbox\n",
"import os\n",
@@ -35,15 +78,17 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'experiment_id': 'Example_S2C', 'model_prefix': 'seed', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'single to combo', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience': 1000}, {'lr_val': 0.01, 'l1lambda': 0.01}, {'lr_val': 0.01, 'l1lambda': 0.0001}, {'lr_val': 0.001, 'l1lambda': 1e-05}]}, {'nT': 200, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}, {'nT': 400, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}], 'ckpt_path_full': './model11.ckpt', 'drug_index': 5, 'seed': 1000}\n",
"Working directory is ready at results/Example_S2C_66d9ccfc914f27490ca1b771e339ab37.\n",
"{'experiment_id': 'Example_LOO', 'model_prefix': 'drug', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'leave one out (w/o single)', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience': 1000}, {'lr_val': 0.01, 'l1lambda': 0.01}, {'lr_val': 0.01, 'l1lambda': 0.0001}, {'lr_val': 0.001, 'l1lambda': 1e-05}]}, {'nT': 200, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}, {'nT': 400, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}], 'ckpt_path_full': './model11.ckpt', 'drug_index': 4, 'seed': 1000}\n",
"Working directory is ready at results/Example_LOO_a7102a7e8a4ad3c23e9eca13cab65b6f.\n",
"Hello!\n",
"Hello!\n",
"Hello!\n"
]
}
@@ -93,7 +138,7 @@
" print('Working directory is ready at {}.'.format(experiment_path))\n",
" return 0\n",
"\n",
"experiment_config_path = \"/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json\"\n",
"experiment_config_path = \"/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json\"\n",
"working_index = 0\n",
"stage = {\n",
" \"nT\": 100,\n",
@@ -107,7 +152,7 @@
"cfg = cellbox.config.Config(experiment_config_path)\n",
"cfg.ckpt_path_full = os.path.join('./', cfg.ckpt_name)\n",
"md5 = cellbox.utils.md5(cfg)\n",
"cfg.drug_index = 5 # Change this for testing purposes\n",
"cfg.drug_index = 4 # Change this for testing purposes\n",
"cfg.seed = working_index + cfg.seed if hasattr(cfg, \"seed\") else working_index + 1000\n",
"set_seed(cfg.seed)\n",
"print(vars(cfg))\n",
@@ -121,21 +166,21 @@
" #args.sub_stages = stage['sub_stages']\n",
" #args.n_T = stage['nT']\n",
" #model = cellbox.model.factory(args)\n",
" if i == 0: break"
" if i == 2: break"
]
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(18, 100)\n",
"(5, 100)\n",
"(66, 100)\n",
"(60, 100)\n",
"(16, 100)\n",
"(13, 100)\n",
"<class 'numpy.ndarray'>\n"
]
}
@@ -162,7 +207,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -187,7 +232,7 @@
" true_drug_index = drug_indices_map[drug_index]\n",
" loo_label = pd.read_csv(\"/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv\", header=None)\n",
" ind_rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)].tolist()\n",
" return np.array(ind_rows) - 1\n",
" return np.array(ind_rows)\n",
"\n",
"drug_indices_map = populate_drug_indices_map()\n",
"if cfg.experiment_type == \"leave one out (w/o single)\":\n",
@@ -198,7 +243,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -207,7 +252,7 @@
"[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 8, 9]"
]
},
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -241,7 +286,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 33,
"metadata": {},
"outputs": [
{
@@ -280,16 +325,16 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
"4"
]
},
"execution_count": 23,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -300,22 +345,44 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 2, 15, 25, 29, 30, 31, 32, 33, 36, 45, 58, 72, 81])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inds"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ind: [41. 76. 11. 34. 24. 66. 23. 29. 42. 70. 79. 43. 51. 69. 54. 53.] shares the ind that contains the drug index: [11. 23. 24. 29. 34. 41. 42. 43. 51. 53. 54. 66. 69. 70. 76. 79.]\n",
"Ind: [88. 12.] shares the ind that contains the drug index: [12. 88.]\n"
"Ind: [48. 82. 63. 67. 7. 28. 64. 9. 20. 74. 80. 12. 83. 11. 37. 41.] shares the ind that contains the drug index: []\n",
"Ind: [ 8. 6. 78. 18. 77. 60. 66. 56. 59. 68. 65. 70. 13. 86. 44. 3.] shares the ind that contains the drug index: []\n",
"Ind: [85. 54. 23. 49. 43. 4. 57. 26. 35. 40. 17. 88. 16. 39. 75. 10.] shares the ind that contains the drug index: []\n",
"Ind: [24. 19. 14. 27. 53. 34. 47. 5. 87. 21. 42. 38.] shares the ind that contains the drug index: []\n"
]
}
],
"source": [
"for pert in items_pert:\n",
" ind = pert[:, -1]\n",
" print(f\"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, rows_with_single_drugs)}\")\n",
" print(f\"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, inds)}\")\n",
" \n",
"#cfg.drug_index"
]
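Note: the leave-one-out row selection exercised in the notebook above reduces to a small lookup against loo_label.csv. A minimal sketch, assuming the file holds two unnamed integer columns naming the drugs applied in each perturbation condition (paths and layout are taken from the notebook cells, not from this diff):

# Minimal sketch of the notebook's row lookup; loo_label.csv is assumed to
# hold two unnamed integer columns listing the drugs applied per condition.
import numpy as np
import pandas as pd

def rows_containing_drug(loo_label_path, true_drug_index):
    """Return the indices of conditions whose perturbation involves the drug."""
    loo_label = pd.read_csv(loo_label_path, header=None)
    # A condition matches when either drug column equals the target drug
    mask = (loo_label[[0, 1]] == true_drug_index).any(axis=1)
    return np.asarray(loo_label.index[mask])

These indices are what the notebook cells intersect with the last column of each perturbation batch.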
53 changes: 46 additions & 7 deletions test.py
@@ -4,15 +4,14 @@
import numpy as np

from test_utils.dataloader import get_dataloader, yield_data_from_tensorflow_dataloader, yield_data_from_pytorch_dataloader, \
-    s2c_row_inds
+    s2c_row_inds, loo_row_inds


#def test_model():
#    os.system('python scripts/main.py -config=configs/Example.minimal.json')
#    files = glob.glob('results/Debugging_*/seed_000/3_best.W*')
#    assert False

-
#################################################### Tests for DataLoaders ####################################################

# Test for correct shape
@@ -21,8 +20,8 @@ def test_correct_shape():
    A function to test if the batch yielded by both TensorFlow and PyTorch has the same shape
    """
    experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.random_partition.json"
-    tensorflow_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=True)
-    pytorch_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=False)
+    tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True)
+    pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False)

    # Code to extract the shape of each yield
    for tf_dict, torch_dict in zip(tensorflow_dataloader_list, pytorch_dataloader_list):
@@ -47,15 +46,15 @@ def test_correct_shape():
            assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"


-# Test for correct input
+# Test for correct input rows for single-to-combo
def test_single_to_combo():
    """
    A function to test if PyTorch and TensorFlow dataloaders yield the correct rows in the dataset for the s2c experiment
    """
    experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json"
    loo_label_dir = "/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv"
-    tensorflow_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=True)
-    pytorch_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=False)
+    tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True)
+    pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False)

    # Get the row indices that contain single drugs
    rows_with_single_drugs, rows_with_multiple_drugs = s2c_row_inds(loo_label_dir)
@@ -84,6 +83,46 @@
            assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"
            assert np.intersect1d(tf_arr[:, -1], rows_with_multiple_drugs).size == 0, "batches for the tf train set contain rows with multiple drugs in s2c mode"
            assert np.intersect1d(torch_arr[:, -1], rows_with_multiple_drugs).size == 0, "batches for the torch train set contain rows with multiple drugs in s2c mode"
+
+
+# Test for correct input rows for leave-one-out
+@pytest.mark.parametrize("drug_index", list(range(0, 13)))
+def test_leave_one_out(drug_index):
+    """
+    A function to test if PyTorch and TensorFlow dataloaders yield the correct rows in the dataset for the leave-one-out experiment
+    """
+    experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json"
+    loo_label_dir = "/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv"
+    tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True, drug_index=drug_index)
+    pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False, drug_index=drug_index)
+
+    # Get the row indices that contain the left-out drug
+    rows_with_left_out_drugs = loo_row_inds(loo_label_dir, cfg)
+
+    # Code to extract the shape of each yield
+    for tf_dict, torch_dict in zip(tensorflow_dataloader_list, pytorch_dataloader_list):
+        tf_train_pert, tf_train_expr = yield_data_from_tensorflow_dataloader(
+            dataloader=tf_dict["iter_train"],
+            feed_dict=tf_dict["feed_dict"]
+        )
+        torch_train_pert, torch_train_expr = yield_data_from_pytorch_dataloader(
+            dataloader=torch_dict["iter_train"]
+        )
+        # Assert that the count of batches obtained is equal
+        assert len(tf_train_pert) == len(torch_train_pert), "Number of batches yielded for train pert not equal"
+        assert len(tf_train_expr) == len(torch_train_expr), "Number of batches yielded for train expr not equal"
+
+        # Assert that each pert batch has the same shape and contains no rows with the left-out drug
+        for tf_arr, torch_arr in zip(tf_train_pert, torch_train_pert):
+            assert tf_arr.shape == np.array(torch_arr).shape, f"For pert batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"
+            assert np.intersect1d(tf_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the tf train set contain rows with the left-out drug in loo mode"
+            assert np.intersect1d(torch_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the torch train set contain rows with the left-out drug in loo mode"
+
+        # Assert the same for the expr batches
+        for tf_arr, torch_arr in zip(tf_train_expr, torch_train_expr):
+            assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"
+            assert np.intersect1d(tf_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the tf expr set contain rows with the left-out drug in loo mode"
+            assert np.intersect1d(torch_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the torch expr set contain rows with the left-out drug in loo mode"
+

if __name__ == '__main__':
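The s2c_row_inds and loo_row_inds helpers imported above live in test_utils/dataloader.py, the fourth changed file, whose diff is not rendered on this page. A plausible sketch of loo_row_inds, pieced together from the notebook cells above (the index permutation and CSV layout are assumptions, not code from this commit):

# Hypothetical reconstruction of loo_row_inds; it mirrors the notebook logic,
# and the drug_indices_map permutation and loo_label.csv layout are assumed.
import numpy as np
import pandas as pd

def loo_row_inds(loo_label_dir, cfg):
    """Rows of the dataset whose condition involves the left-out drug."""
    drug_indices_map = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 8, 9]  # as printed in the notebook
    true_drug_index = drug_indices_map[cfg.drug_index]
    loo_label = pd.read_csv(loo_label_dir, header=None)
    rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)]
    return np.asarray(rows)

With the helpers in place, the new parametrized test runs once per drug index, e.g. pytest test.py -k test_leave_one_out.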