Add test for s2c and loo
Phuc Nguyen committed Jun 15, 2023
1 parent 45a9394 commit 3504a20
Showing 4 changed files with 174 additions and 38 deletions.
1 change: 0 additions & 1 deletion cellbox/cellbox/dataset.py
@@ -15,7 +15,6 @@
def factory(cfg):
    """formulate training dataset"""
    # Prepare data
-    print("Hello!")
    if cfg.sparse_data:
        cfg.pert_in = tf.compat.v1.sparse.placeholder(tf.float32, [None, cfg.n_x], name='pert_in')
        cfg.expr_out = tf.compat.v1.sparse.placeholder(tf.float32, [None, cfg.n_x], name='expr_out')
115 changes: 91 additions & 24 deletions notebooks/dataloader.ipynb
@@ -10,9 +10,52 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-06-14 23:54:04.431174: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-06-14 23:54:04.628344: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2023-06-14 23:54:04.634777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n",
"2023-06-14 23:54:04.634820: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n",
"2023-06-14 23:54:09.211164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n",
"2023-06-14 23:54:09.212455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/lsf10/10.1/linux3.10-glibc2.17-x86_64/lib:/data/weirauchlab/opt/lib:/data/weirauchlab/opt/lib64:/data/weirauchlab/local/lib\n",
"2023-06-14 23:54:09.212486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /users/ngun7t/anaconda3/envs/cellbox-3.6-2/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:107: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"non-resource variables are not supported in the long term\n",
"================================================================================\n",
" _____ _ _ ____ \n",
" / ____| | | | _ \\ \n",
" | | ___| | | |_) | _____ __ \n",
" | | / _ \\ | | _ < / _ \\ \\/ / \n",
" | |___| __/ | | |_) | (_) > < \n",
" \\_____\\___|_|_|____/ \\___/_/\\_\\ \n",
"Running CellBox scripts developed in Sander lab\n",
"Maintained by Bo Yuan, Judy Shen, and Augustin Luna; contributions by Daniel Ritter\n",
"\n",
" version 0.3.2\n",
" -- Feb 10, 2023 --\n",
" * Modify CellBox to support TF2 \n",
" \n",
"Tutorials and documentations are available at https://github.com/sanderlab/CellBox\n",
"If you want to discuss the usage or to report a bug, please use the 'Issues' function at GitHub.\n",
"If you find CellBox useful for your research, please consider citing the corresponding publication.\n",
"For more information, please email us at [email protected] and [email protected], [email protected]\n",
" --------------------------------------------------------------------------------\n"
]
}
],
"source": [
"import cellbox\n",
"import os\n",
@@ -35,15 +78,17 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'experiment_id': 'Example_S2C', 'model_prefix': 'seed', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'single to combo', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience': 1000}, {'lr_val': 0.01, 'l1lambda': 0.01}, {'lr_val': 0.01, 'l1lambda': 0.0001}, {'lr_val': 0.001, 'l1lambda': 1e-05}]}, {'nT': 200, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}, {'nT': 400, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}], 'ckpt_path_full': './model11.ckpt', 'drug_index': 5, 'seed': 1000}\n",
"Working directory is ready at results/Example_S2C_66d9ccfc914f27490ca1b771e339ab37.\n",
"{'experiment_id': 'Example_LOO', 'model_prefix': 'drug', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'leave one out (w/o single)', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience': 1000}, {'lr_val': 0.01, 'l1lambda': 0.01}, {'lr_val': 0.01, 'l1lambda': 0.0001}, {'lr_val': 0.001, 'l1lambda': 1e-05}]}, {'nT': 200, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}, {'nT': 400, 'sub_stages': [{'lr_val': 0.001, 'l1lambda': 0.0001}]}], 'ckpt_path_full': './model11.ckpt', 'drug_index': 4, 'seed': 1000}\n",
"Working directory is ready at results/Example_LOO_a7102a7e8a4ad3c23e9eca13cab65b6f.\n",
"Hello!\n",
"Hello!\n",
"Hello!\n"
]
}
@@ -93,7 +138,7 @@
" print('Working directory is ready at {}.'.format(experiment_path))\n",
" return 0\n",
"\n",
"experiment_config_path = \"/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json\"\n",
"experiment_config_path = \"/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json\"\n",
"working_index = 0\n",
"stage = {\n",
" \"nT\": 100,\n",
@@ -107,7 +152,7 @@
"cfg = cellbox.config.Config(experiment_config_path)\n",
"cfg.ckpt_path_full = os.path.join('./', cfg.ckpt_name)\n",
"md5 = cellbox.utils.md5(cfg)\n",
"cfg.drug_index = 5 # Change this for testing purposes\n",
"cfg.drug_index = 4 # Change this for testing purposes\n",
"cfg.seed = working_index + cfg.seed if hasattr(cfg, \"seed\") else working_index + 1000\n",
"set_seed(cfg.seed)\n",
"print(vars(cfg))\n",
@@ -121,21 +166,21 @@
" #args.sub_stages = stage['sub_stages']\n",
" #args.n_T = stage['nT']\n",
" #model = cellbox.model.factory(args)\n",
" if i == 0: break"
" if i == 2: break"
]
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(18, 100)\n",
"(5, 100)\n",
"(66, 100)\n",
"(60, 100)\n",
"(16, 100)\n",
"(13, 100)\n",
"<class 'numpy.ndarray'>\n"
]
}
@@ -162,7 +207,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -187,7 +232,7 @@
" true_drug_index = drug_indices_map[drug_index]\n",
" loo_label = pd.read_csv(\"/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv\", header=None)\n",
" ind_rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)].tolist()\n",
" return np.array(ind_rows) - 1\n",
" return np.array(ind_rows)\n",
"\n",
"drug_indices_map = populate_drug_indices_map()\n",
"if cfg.experiment_type == \"leave one out (w/o single)\":\n",
@@ -198,7 +243,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -207,7 +252,7 @@
"[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 8, 9]"
]
},
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -241,7 +286,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 33,
"metadata": {},
"outputs": [
{
@@ -280,16 +325,16 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
"4"
]
},
"execution_count": 23,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -300,22 +345,44 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 2, 15, 25, 29, 30, 31, 32, 33, 36, 45, 58, 72, 81])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inds"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ind: [41. 76. 11. 34. 24. 66. 23. 29. 42. 70. 79. 43. 51. 69. 54. 53.] shares the ind that contains the drug index: [11. 23. 24. 29. 34. 41. 42. 43. 51. 53. 54. 66. 69. 70. 76. 79.]\n",
"Ind: [88. 12.] shares the ind that contains the drug index: [12. 88.]\n"
"Ind: [48. 82. 63. 67. 7. 28. 64. 9. 20. 74. 80. 12. 83. 11. 37. 41.] shares the ind that contains the drug index: []\n",
"Ind: [ 8. 6. 78. 18. 77. 60. 66. 56. 59. 68. 65. 70. 13. 86. 44. 3.] shares the ind that contains the drug index: []\n",
"Ind: [85. 54. 23. 49. 43. 4. 57. 26. 35. 40. 17. 88. 16. 39. 75. 10.] shares the ind that contains the drug index: []\n",
"Ind: [24. 19. 14. 27. 53. 34. 47. 5. 87. 21. 42. 38.] shares the ind that contains the drug index: []\n"
]
}
],
"source": [
"for pert in items_pert:\n",
" ind = pert[:, -1]\n",
" print(f\"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, rows_with_single_drugs)}\")\n",
" print(f\"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, inds)}\")\n",
" \n",
"#cfg.drug_index"
]
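Note: the leave-one-out row selection exercised in the notebook above reduces to a small lookup against loo_label.csv. A minimal sketch, assuming the file holds two unnamed integer columns naming the drugs applied in each perturbation condition (paths and layout are taken from the notebook cells, not from this diff):

# Minimal sketch of the notebook's row lookup; loo_label.csv is assumed to
# hold two unnamed integer columns listing the drugs applied per condition.
import numpy as np
import pandas as pd

def rows_containing_drug(loo_label_path, true_drug_index):
    """Return the indices of conditions whose perturbation involves the drug."""
    loo_label = pd.read_csv(loo_label_path, header=None)
    # A condition matches when either drug column equals the target drug
    mask = (loo_label[[0, 1]] == true_drug_index).any(axis=1)
    return np.asarray(loo_label.index[mask])

These indices are what the notebook cells intersect with the last column of each perturbation batch.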
53 changes: 46 additions & 7 deletions test.py
@@ -4,15 +4,14 @@
import numpy as np

from test_utils.dataloader import get_dataloader, yield_data_from_tensorflow_dataloader, yield_data_from_pytorch_dataloader, \
-    s2c_row_inds
+    s2c_row_inds, loo_row_inds


#def test_model():
#    os.system('python scripts/main.py -config=configs/Example.minimal.json')
#    files = glob.glob('results/Debugging_*/seed_000/3_best.W*')
#    assert False

-
#################################################### Tests for DataLoaders ####################################################

# Test for correct shape
@@ -21,8 +20,8 @@ def test_correct_shape():
    A function to test if the batch yielded by both TensorFlow and PyTorch has the same shape
    """
    experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.random_partition.json"
-    tensorflow_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=True)
-    pytorch_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=False)
+    tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True)
+    pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False)

    # Code to extract the shape of each yield
    for tf_dict, torch_dict in zip(tensorflow_dataloader_list, pytorch_dataloader_list):
@@ -47,15 +46,15 @@ def test_correct_shape():
            assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"


-# Test for correct input
+# Test for correct input rows for single-to-combo
def test_single_to_combo():
    """
    A function to test if PyTorch and TensorFlow dataloaders yield the correct rows in the dataset for the s2c experiment
    """
    experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json"
    loo_label_dir = "/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv"
-    tensorflow_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=True)
-    pytorch_dataloader_list = get_dataloader(experiment_config_path, tensorflow_code=False)
+    tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True)
+    pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False)

    # Get the row indices that contain single drugs
    rows_with_single_drugs, rows_with_multiple_drugs = s2c_row_inds(loo_label_dir)
@@ -84,6 +83,46 @@
            assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"
            assert np.intersect1d(tf_arr[:, -1], rows_with_multiple_drugs).size == 0, "batches for the tf train set contain rows with multiple drugs in s2c mode"
            assert np.intersect1d(torch_arr[:, -1], rows_with_multiple_drugs).size == 0, "batches for the torch train set contain rows with multiple drugs in s2c mode"
+
+
+# Test for correct input rows for leave-one-out
+@pytest.mark.parametrize("drug_index", list(range(0, 13)))
+def test_leave_one_out(drug_index):
+    """
+    A function to test if PyTorch and TensorFlow dataloaders yield the correct rows in the dataset for the leave-one-out experiment
+    """
+    experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json"
+    loo_label_dir = "/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv"
+    tensorflow_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=True, drug_index=drug_index)
+    pytorch_dataloader_list, cfg = get_dataloader(experiment_config_path, tensorflow_code=False, drug_index=drug_index)
+
+    # Get the row indices that contain the left-out drug
+    rows_with_left_out_drugs = loo_row_inds(loo_label_dir, cfg)
+
+    # Code to extract the shape of each yield
+    for tf_dict, torch_dict in zip(tensorflow_dataloader_list, pytorch_dataloader_list):
+        tf_train_pert, tf_train_expr = yield_data_from_tensorflow_dataloader(
+            dataloader=tf_dict["iter_train"],
+            feed_dict=tf_dict["feed_dict"]
+        )
+        torch_train_pert, torch_train_expr = yield_data_from_pytorch_dataloader(
+            dataloader=torch_dict["iter_train"]
+        )
+        # Assert that the count of batches obtained is equal
+        assert len(tf_train_pert) == len(torch_train_pert), "Number of batches yielded for train pert not equal"
+        assert len(tf_train_expr) == len(torch_train_expr), "Number of batches yielded for train expr not equal"
+
+        # Assert that each pert batch has the same shape and contains no rows with the left-out drug
+        for tf_arr, torch_arr in zip(tf_train_pert, torch_train_pert):
+            assert tf_arr.shape == np.array(torch_arr).shape, f"For pert batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"
+            assert np.intersect1d(tf_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the tf train set contain rows with the left-out drug in loo mode"
+            assert np.intersect1d(torch_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the torch train set contain rows with the left-out drug in loo mode"
+
+        # Assert the same for the expr batches
+        for tf_arr, torch_arr in zip(tf_train_expr, torch_train_expr):
+            assert tf_arr.shape == np.array(torch_arr).shape, f"For expr batches, shape of tf batch = {tf_arr.shape} is not equal to shape of torch batch = {np.array(torch_arr).shape}"
+            assert np.intersect1d(tf_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the tf expr set contain rows with the left-out drug in loo mode"
+            assert np.intersect1d(torch_arr[:, -1], rows_with_left_out_drugs).size == 0, "batches for the torch expr set contain rows with the left-out drug in loo mode"
+

if __name__ == '__main__':
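The s2c_row_inds and loo_row_inds helpers imported above live in test_utils/dataloader.py, the fourth changed file, whose diff is not rendered on this page. A plausible sketch of loo_row_inds, pieced together from the notebook cells above (the index permutation and CSV layout are assumptions, not code from this commit):

# Hypothetical reconstruction of loo_row_inds; it mirrors the notebook logic,
# and the drug_indices_map permutation and loo_label.csv layout are assumed.
import numpy as np
import pandas as pd

def loo_row_inds(loo_label_dir, cfg):
    """Rows of the dataset whose condition involves the left-out drug."""
    drug_indices_map = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 8, 9]  # as printed in the notebook
    true_drug_index = drug_indices_map[cfg.drug_index]
    loo_label = pd.read_csv(loo_label_dir, header=None)
    rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)]
    return np.asarray(rows)

With the helpers in place, the new parametrized test runs once per drug index, e.g. pytest test.py -k test_leave_one_out.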