diff --git a/cellbox/cellbox/dataset.py b/cellbox/cellbox/dataset.py index 572a89f..5c8c663 100644 --- a/cellbox/cellbox/dataset.py +++ b/cellbox/cellbox/dataset.py @@ -15,7 +15,6 @@ def factory(cfg): """formulate training dataset""" # Prepare data - print("Hello!") if cfg.sparse_data: cfg.pert_in = tf.compat.v1.sparse.placeholder(tf.float32, [None, cfg.n_x], name='pert_in') cfg.expr_out = tf.compat.v1.sparse.placeholder(tf.float32, [None, cfg.n_x], name='expr_out') diff --git a/notebooks/dataloader.ipynb b/notebooks/dataloader.ipynb index 670ed0b..57f6980 100644 --- a/notebooks/dataloader.ipynb +++ b/notebooks/dataloader.ipynb @@ -10,9 +10,52 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-14 23:54:04.431174: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-06-14 23:54:04.628344: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-06-14 23:54:04.634777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n", + "2023-06-14 23:54:04.634820: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", + "2023-06-14 23:54:09.211164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n", + "2023-06-14 23:54:09.212455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n", + "2023-06-14 23:54:09.212486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /users/ngun7t/anaconda3/envs/cellbox-3.6-2/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "non-resource variables are not supported in the long term\n", + "================================================================================\n", + " _____ _ _ ____ \n", + " / ____| | | | _ \\ \n", + " | | ___| | | |_) | _____ __ \n", + " | | / _ \\ | | _ < / _ \\ \\/ / \n", + " | |___| __/ | | |_) | (_) > < \n", + " \\_____\\___|_|_|____/ \\___/_/\\_\\ \n", + "Running CellBox scripts developed in Sander lab\n", + "Maintained by Bo Yuan, Judy Shen, and Augustin Luna; contributions by Daniel Ritter\n", + "\n", + " version 0.3.2\n", + " -- Feb 10, 2023 --\n", + " * Modify CellBox to support TF2 \n", + " \n", + "Tutorials and documentations are available at https://github.com/sanderlab/CellBox\n", + "If you want to discuss the usage or to report a bug, please use the 'Issues' function at GitHub.\n", + "If you find CellBox useful for your research, please consider citing the corresponding publication.\n", + "For more information, please email us at boyuan@g.harvard.edu and c_shen@g.harvard.edu, augustin_luna@hms.harvard.edu\n", + " --------------------------------------------------------------------------------\n" + ] + } + ], "source": [ "import cellbox\n", "import os\n", @@ -35,15 +78,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'experiment_id': 'Example_S2C', 'model_prefix': 'seed', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 
'experiment_type': 'single to combo', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience': 1000}, {'lr_val': 0.01, 'l1lambda': 0.01}, {'lr_val': 0.01, 'l1lambda': 0.0001}, {'lr_val': 0.001, 'l1lambda': 1e-05}]}, {'nT': 200, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}, {'nT': 400, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}], 'ckpt_path_full': './model11.ckpt', 'drug_index': 5, 'seed': 1000}\n", - "Working directory is ready at results/Example_S2C_66d9ccfc914f27490ca1b771e339ab37.\n", + "{'experiment_id': 'Example_LOO', 'model_prefix': 'drug', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'leave one out (w/o single)', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': 
'/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience': 1000}, {'lr_val': 0.01, 'l1lambda': 0.01}, {'lr_val': 0.01, 'l1lambda': 0.0001}, {'lr_val': 0.001, 'l1lambda': 1e-05}]}, {'nT': 200, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}, {'nT': 400, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}], 'ckpt_path_full': './model11.ckpt', 'drug_index': 4, 'seed': 1000}\n", + "Working directory is ready at results/Example_LOO_a7102a7e8a4ad3c23e9eca13cab65b6f.\n", + "Hello!\n", + "Hello!\n", "Hello!\n" ] } @@ -93,7 +138,7 @@ " print('Working directory is ready at {}.'.format(experiment_path))\n", " return 0\n", "\n", - "experiment_config_path = \"/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json\"\n", + "experiment_config_path = \"/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json\"\n", "working_index = 0\n", "stage = {\n", " \"nT\": 100,\n", @@ -107,7 +152,7 @@ "cfg = cellbox.config.Config(experiment_config_path)\n", "cfg.ckpt_path_full = os.path.join('./', cfg.ckpt_name)\n", "md5 = cellbox.utils.md5(cfg)\n", - "cfg.drug_index = 5 # Change this for testing purposes\n", + "cfg.drug_index = 4 # Change this for testing purposes\n", "cfg.seed = working_index + cfg.seed if hasattr(cfg, \"seed\") else working_index + 1000\n", "set_seed(cfg.seed)\n", "print(vars(cfg))\n", @@ -121,21 +166,21 @@ " #args.sub_stages = stage['sub_stages']\n", " #args.n_T = stage['nT']\n", " #model = cellbox.model.factory(args)\n", - " if i == 0: break" + " if i == 2: break" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ - "(18, 100)\n", - "(5, 100)\n", - "(66, 100)\n", + "(60, 100)\n", + "(16, 100)\n", + "(13, 100)\n", "\n" ] } @@ -162,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -187,7 +232,7 @@ " true_drug_index = drug_indices_map[drug_index]\n", " loo_label = pd.read_csv(\"/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv\", header=None)\n", " ind_rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)].tolist()\n", - " return np.array(ind_rows) - 1\n", + " return np.array(ind_rows)\n", "\n", "drug_indices_map = populate_drug_indices_map()\n", "if cfg.experiment_type == \"leave one out (w/o single)\":\n", @@ -198,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -207,7 +252,7 @@ "[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 8, 9]" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -241,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -280,16 +325,16 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2" + "4" ] }, - "execution_count": 23, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -300,22 +345,44 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 2, 15, 25, 29, 30, 31, 32, 33, 36, 45, 58, 72, 81])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inds" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Ind: [41. 76. 11. 34. 24. 66. 23. 29. 42. 70. 79. 43. 51. 69. 54. 53.] 
shares the ind that contains the drug index: [11. 23. 24. 29. 34. 41. 42. 43. 51. 53. 54. 66. 69. 70. 76. 79.]\n", - "Ind: [88. 12.] shares the ind that contains the drug index: [12. 88.]\n" + "Ind: [48. 82. 63. 67. 7. 28. 64. 9. 20. 74. 80. 12. 83. 11. 37. 41.] shares the ind that contains the drug index: []\n", + "Ind: [ 8. 6. 78. 18. 77. 60. 66. 56. 59. 68. 65. 70. 13. 86. 44. 3.] shares the ind that contains the drug index: []\n", + "Ind: [85. 54. 23. 49. 43. 4. 57. 26. 35. 40. 17. 88. 16. 39. 75. 10.] shares the ind that contains the drug index: []\n", + "Ind: [24. 19. 14. 27. 53. 34. 47. 5. 87. 21. 42. 38.] shares the ind that contains the drug index: []\n" ] } ], "source": [ "for pert in items_pert:\n", " ind = pert[:, -1]\n", - " print(f\"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, rows_with_single_drugs)}\")\n", + " print(f\"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, inds)}\")\n", " \n", "#cfg.drug_index" ] diff --git a/test.py b/test.py index 33fb835..10d6fd6 100644 --- a/test.py +++ b/test.py @@ -4,7 +4,7 @@ import numpy as np from test_utils.dataloader import get_dataloader, yield_data_from_tensorflow_dataloader, yield_data_from_pytorch_dataloader, \ - s2c_row_inds + s2c_row_inds, loo_row_inds #def test_model(): @@ -12,7 +12,6 @@ # files = glob.glob('results/Debugging_*/seed_000/3_best.W*') # assert False - #################################################### Tests for DataLoaders #################################################### # Test for correct shape @@ -21,8 +20,8 @@ def test_correct_shape(): A function to test if the batch yielded by both Tensorflow and Pytorch has the same shape """ experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.random_partition.json" - tensorflow_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=True) - pytorch_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=False) + 
tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True) + pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False) # Code to extract the shape of each yield for tf_dict, torch_dict in zip(tensorflow_dataloader_list, pytorch_dataloader_list): @@ -47,15 +46,15 @@ def test_correct_shape(): assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}" -# Test for correct input +# Test for correct input rows for single-to-combo def test_single_to_combo(): """ A function to test if pytorch and tensorflow dataloaders yield the correct rows in the dataset for s2c experiment """ experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json" loo_label_dir = "/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv" - tensorflow_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=True) - pytorch_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=False) + tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True) + pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False) # Get the row index that contains single drugs rows_with_single_drugs, rows_with_multiple_drugs = s2c_row_inds(loo_label_dir) @@ -84,6 +83,46 @@ def test_single_to_combo(): assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}" assert np.intersect1d(tf_arr[:, -1], rows_with_multiple_drugs).size == 0, f"batches for tf train set contains data rows that has multiple drugs in s2c mode" assert np.intersect1d(torch_arr[:, -1], rows_with_multiple_drugs).size == 0, f"batches for torch train set contains data rows that has multiple drugs in s2c mode" + + +# Test for 
# Leave-one-out: train batches must never carry a row that involves the held-out drug
@pytest.mark.parametrize("drug_index", list(range(13)))
def test_leave_one_out(drug_index):
    """
    Check the train dataloaders of the leave-one-out experiment for one ``drug_index``.

    The Tensorflow and Pytorch dataloaders are built from the same config, then for every
    pair of dataloader entries the test verifies that
    - both yield the same number of pert batches and of expr batches,
    - paired batches have identical shapes,
    - the last column of every train batch shares no value with the row indices returned
      by ``loo_row_inds`` (the rows involving the left-out drug).
    """
    config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json"
    label_path = "/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv"
    tf_loaders, cfg = get_dataloader(config_path, tensorflow_code=True, drug_index=drug_index)
    # cfg is rebound here; the one returned by the Pytorch call is what feeds loo_row_inds
    torch_loaders, cfg = get_dataloader(config_path, tensorflow_code=False, drug_index=drug_index)

    # Row indices of the dataset that involve the left-out drug
    held_out_rows = loo_row_inds(label_path, cfg)

    for tf_entry, torch_entry in zip(tf_loaders, torch_loaders):
        # Drain the train iterator of each framework into lists of pert / expr batches
        tf_train_pert, tf_train_expr = yield_data_from_tensorflow_dataloader(
            dataloader=tf_entry["iter_train"],
            feed_dict=tf_entry["feed_dict"],
        )
        torch_train_pert, torch_train_expr = yield_data_from_pytorch_dataloader(
            dataloader=torch_entry["iter_train"],
        )

        # Both frameworks must produce the same number of batches
        assert len(tf_train_pert) == len(torch_train_pert), "Length of number of arrays yield for train pert not equal"
        assert len(tf_train_expr) == len(torch_train_expr), "Length of number of arrays yield for train expr not equal"

        # Pert batches: same shape, and no held-out row index in the last column
        for tf_batch, torch_batch in zip(tf_train_pert, torch_train_pert):
            torch_shape = np.array(torch_batch).shape
            assert tf_batch.shape == torch_shape, f"For pert batches, shape of tf batch = {tf_batch.shape} is not equal to shape of torch batch = {torch_shape}"
            tf_overlap = np.intersect1d(tf_batch[:, -1], held_out_rows)
            assert tf_overlap.size == 0, f"batches for tf train set contains data rows that has left-out drugs in loo mode"
            torch_overlap = np.intersect1d(torch_batch[:, -1], held_out_rows)
            assert torch_overlap.size == 0, f"batches for torch train set contains data rows that has left-out drugs in loo mode"

        # Expr batches: same checks as for the pert batches
        for tf_batch, torch_batch in zip(tf_train_expr, torch_train_expr):
            torch_shape = np.array(torch_batch).shape
            assert tf_batch.shape == torch_shape, f"For expr batches, shape of tf batch = {tf_batch.shape} is not equal to shape of torch batch = {torch_shape}"
            tf_overlap = np.intersect1d(tf_batch[:, -1], held_out_rows)
            assert tf_overlap.size == 0, f"batches for tf train set contains data rows that has left-out drugs in loo mode"
            torch_overlap = np.intersect1d(torch_batch[:, -1], held_out_rows)
            assert torch_overlap.size == 0, f"batches for torch train set contains data rows that has left-out drugs in loo mode"
def loo_row_inds(loo_label_dir, cfg, n_drugs=14):
    """
    Identify the rows of the dataset that involve the drug left out by ``cfg.drug_index``
    There is some complication in this function, check https://github.com/sanderlab/CellBox/issues/48

    ``cfg.drug_index`` is not compared with the label file directly. It is used as a
    position in a map that is rebuilt here: for each candidate index in ``range(n_drugs)``
    the rows of ``cfg.loo`` containing that index are selected (restricted to rows whose
    entries are all non-zero for the "w/ single" experiment), and the first label in
    ``range(n_drugs)`` present in every selected row is recorded. The label found at
    position ``cfg.drug_index`` of that map is then looked up in columns 0 and 1 of the file.

    Args:
        loo_label_dir: path of the label CSV; it is read without a header row.
        cfg: object exposing ``loo``, ``experiment_type`` and ``drug_index``. ``loo`` is used
            through ``==``, ``.any(axis=1)`` and ``.all(axis=1)`` and as a boolean row mask
            over the CSV (presumably the same label table as a DataFrame - confirm with caller).
        n_drugs: number of candidate indices / labels to scan; defaults to the previously
            hard-coded 14.

    Returns:
        np.ndarray with the index labels of the CSV rows whose column 0 or column 1 equals
        the mapped drug label.

    Raises:
        ValueError: if ``cfg.experiment_type`` is not one of the two leave-one-out types.
        IndexError, TypeError: if ``cfg.drug_index`` cannot be used as a position in the map
            (out of range, or not an integer such as ``None``).
    """
    # Decide once whether single-drug rows are kept out of the held-out selection.
    if cfg.experiment_type == 'leave one out (w/o single)':
        singles = False
    elif cfg.experiment_type == 'leave one out (w/ single)':
        singles = True
    else:
        raise ValueError(
            f"loo_row_inds only supports leave-one-out experiments, got experiment_type={cfg.experiment_type!r}"
        )

    # Loop-invariant inputs: read the label file once and precompute the "all entries non-zero" mask.
    loo_label = pd.read_csv(loo_label_dir, header=None)
    double_idx = cfg.loo.all(axis=1)

    drug_indices_map = []
    for drug_index in range(n_drugs):
        testidx = (cfg.loo == drug_index).any(axis=1)
        if singles:
            # Keep only rows that contain the index AND have no zero entry
            testidx = pd.concat([testidx, double_idx], axis=1).all(axis=1)

        selected = loo_label[testidx]
        for candidate in range(n_drugs):
            # NOTE: an empty selection satisfies this check vacuously, so candidate 0 is
            # recorded for it; this is kept on purpose so positions in the map do not shift.
            if (selected == candidate).any(axis=1).all():
                drug_indices_map.append(candidate)
                break

    print(f"Drug indices map: {drug_indices_map}")
    try:
        true_drug_index = drug_indices_map[cfg.drug_index]
    except (IndexError, TypeError) as err:
        # Same exception type as the bare list lookup, but with enough context to debug it
        raise type(err)(
            f"cfg.drug_index={cfg.drug_index!r} is not a valid position in the drug indices map "
            f"of length {len(drug_indices_map)}"
        ) from err

    ind_rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)].tolist()
    return np.array(ind_rows)