fix tests with reqs #567

Merged
merged 3 commits into from
May 21, 2024
2 changes: 1 addition & 1 deletion docker/vm_boot_images/config/tensorflow-requirements.txt
@@ -2,7 +2,7 @@ pydot
nibabel==4.0.2
pydicom==1.2.2
hyperopt==0.1.2
seaborn==0.11.2
seaborn
scikit-image
biosppy
vtk==9.2.6
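Unpinning seaborn here matters beyond the Docker image, because setup.py (changed below in this PR) now reads install_requires from this same file. A quick, purely illustrative smoke test that the resolved environment still imports together might look like this (the names below are the import names of the pins shown above):

```python
# Purely illustrative smoke test: confirm the packages touched by this
# requirements file still import together once the seaborn pin is dropped.
import hyperopt
import nibabel
import pydicom
import seaborn
import skimage  # scikit-image installs under the import name "skimage"
import vtk

print('seaborn resolved to', seaborn.__version__)
```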
124 changes: 61 additions & 63 deletions notebooks/ML4H_Model_Factory_Intro.ipynb
@@ -28,7 +28,7 @@
"import os\n",
"import sys\n",
"import pickle\n",
"import random\n",
"import gzip\n",
"from typing import List, Dict, Callable\n",
"from collections import defaultdict, Counter\n",
"\n",
@@ -57,7 +57,7 @@
"outputs": [],
"source": [
"# Constants\n",
"HD5_FOLDER = './tensors/'\n",
"HD5_FOLDER = './mnist_tensors/'\n",
"OUTPUT_FOLDER = './outputs/'"
]
},
@@ -119,35 +119,48 @@
"metadata": {},
"outputs": [],
"source": [
"DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3}\n",
"VARIANT_LABELS = {'NOT_SNP': 0, 'NOT_INDEL': 1, 'SNP': 2, 'INDEL': 3}\n",
"def load_data(dataset):\n",
" ''' Loads the dataset\n",
" :param dataset: the path to the dataset (here MNIST)'''\n",
" data_dir, data_file = os.path.split(dataset)\n",
" if data_dir == \"\" and not os.path.isfile(dataset):\n",
" # Check if dataset is in the data directory.\n",
" new_path = os.path.join(\"data\", dataset)\n",
" if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':\n",
" dataset = new_path\n",
"\n",
" if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':\n",
" from urllib.request import urlretrieve\n",
" origin = ('http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz')\n",
" print('Downloading data from %s' % origin)\n",
" if not os.path.exists(os.path.dirname(dataset)):\n",
" os.makedirs(os.path.dirname(dataset))\n",
" urlretrieve(origin, dataset)\n",
"\n",
"def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:\n",
" return np.array(hd5[tm.name])\n",
" print('loading data...')\n",
" f = gzip.open(dataset, 'rb')\n",
" if sys.version_info[0] == 3:\n",
" u = pickle._Unpickler(f)\n",
" u.encoding = 'latin1'\n",
" train_set, valid_set, test_set = u.load()\n",
" else:\n",
" train_set, valid_set, test_set = pickle.load(f)\n",
" f.close()\n",
"\n",
" return train_set, valid_set, test_set\n",
"\n",
"reference = TensorMap('reference', shape=(128, len(DNA_SYMBOLS)), tensor_from_file=tensor_from_hd5)\n",
"read_tensor = TensorMap('read_tensor', shape=(128, 128, 15), tensor_from_file=tensor_from_hd5)\n",
"\n",
"\n",
"def variant_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:\n",
" one_hot = np.zeros(tm.shape, dtype=np.float32)\n",
" variant_str = str(hd5['variant_label'][()], 'utf-8')\n",
" for channel in tm.channel_map:\n",
" if channel.lower() == variant_str.lower():\n",
" one_hot[tm.channel_map[channel]] = 1.0\n",
" if one_hot.sum() != 1:\n",
" raise ValueError(f'TensorMap {tm.name} missing or invalid label: {variant_str} one_hot: {one_hot}')\n",
" return one_hot\n",
"\n",
"\n",
"variant_label = TensorMap(\n",
" 'variant_label', Interpretation.CATEGORICAL,\n",
" shape=(len(VARIANT_LABELS),),\n",
" tensor_from_file=variant_label_from_hd5,\n",
" channel_map=VARIANT_LABELS,\n",
")"
"def mnist_as_hd5(hd5_folder):\n",
" train, _, _ = load_data('mnist.pkl.gz')\n",
" mnist_images = train[0].reshape((-1, 28, 28, 1))\n",
" if not os.path.exists(hd5_folder):\n",
" os.makedirs(hd5_folder)\n",
" for i, mnist_image in enumerate(mnist_images):\n",
" with h5py.File(os.path.join(hd5_folder, f'{i}.hd5'), 'w') as hd5:\n",
" hd5.create_dataset('mnist_image', data=mnist_image)\n",
" hd5.create_dataset('mnist_label', data=[train[1][i]])\n",
" if (i+1) % 5000 == 0:\n",
" print(f'Wrote {i+1} MNIST images and labels as HD5 files')"
]
},
{
@@ -163,9 +176,7 @@
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(HD5_FOLDER):\n",
" os.makedirs(HD5_FOLDER)\n",
"!tar -zxvf ./hg002_na24385_ml4h_tensors_v2021_10_14.tar.gz -C ./tensors/"
"mnist_as_hd5(HD5_FOLDER)"
]
},
{
@@ -183,14 +194,6 @@
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1D CNN for Classification of Genomic Variants\n",
"Jupyter is great, but can complicate productionizing code. We try to mitigate this by interacting with the jupyter notebook as if it were a command line call to one of ml4h's modes. "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -199,31 +202,26 @@
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--batch_size', '16',\n",
" '--epochs', '12',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--id', 'learn_1d_cnn'\n",
" '--id', 'learn_2d_cnn'\n",
" ]\n",
"args = parse_args()\n",
"metrics = train_multimodal_multitask(args)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.read_tensor',\n",
" '--output_tensors', 'gatk.variant_label',\n",
"metrics = train_multimodal_multitask(args)\n",
"\n",
"sys.argv = ['train',\n",
" '--tensors', HD5_FOLDER,\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--activation', 'mish',\n",
" '--dense_blocks', '64', '64', '64',\n",
" '--batch_size', '16',\n",
" '--epochs', '12',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--id', 'learn_2d_cnn'\n",
" '--id', 'learn_2d_cnn2'\n",
" ]\n",
"args = parse_args()\n",
"metrics = train_multimodal_multitask(args)"
@@ -244,12 +242,12 @@
"source": [
"sys.argv = ['compare_scalar', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference', 'gatk.read_tensor',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--id', 'gatk_model_comparison',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--id', 'mnist_model_comparison',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--model_files', f'{OUTPUT_FOLDER}learn_1d_cnn/learn_1d_cnn.h5',\n",
" f'{OUTPUT_FOLDER}learn_2d_cnn/learn_2d_cnn.h5',\n",
" '--model_files', f'{OUTPUT_FOLDER}learn_2d_cnn/learn_2d_cnn.h5',\n",
" f'{OUTPUT_FOLDER}learn_2d_cnn2/learn_2d_cnn2.h5',\n",
" '--test_steps', '100', \n",
" '--batch_size', '16',\n",
" ]\n",
@@ -275,8 +273,8 @@
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--activation', 'swish',\n",
" '--conv_layers', '32',\n",
@@ -289,7 +287,7 @@
" '--inspect_model',\n",
" '--epochs', '1',\n",
" '--batch_size', '4',\n",
" '--id', 'hypertuned_1d',\n",
" '--id', 'hypertuned_2d',\n",
" ]\n",
"args = parse_args()\n",
"generate_train, generate_valid, generate_test = test_train_valid_tensor_generators(**args.__dict__)\n",
Expand All @@ -300,7 +298,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"After running the cell above the diagram of the model architecture will be saved at: `./outputs/hypertuned_1d/architecture_graph_hypertuned_1d.png`"
"After running the cell above the diagram of the model architecture will be saved at: `./outputs/hypertuned_2d/architecture_graph_hypertuned_2d.png`"
]
}
],
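The `mnist.mnist_image` and `mnist.mnist_label` strings passed to `--input_tensors`/`--output_tensors` refer to TensorMaps that live in ml4h's tensormap modules and are not part of this diff. As a rough sketch of what such maps could look like, mirroring the removed `tensor_from_hd5`/`variant_label_from_hd5` pattern and the datasets written by `mnist_as_hd5` (the import path, helper names, and channel labels here are assumptions, not the repo's actual definitions):

```python
# Illustrative only: hypothetical TensorMaps for the MNIST HD5 files written by
# mnist_as_hd5, following the same pattern as the removed gatk maps above.
from typing import Dict

import h5py
import numpy as np

from ml4h.TensorMap import TensorMap, Interpretation  # assumed import path


def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    # Read the dataset named after the TensorMap straight out of the HD5 file.
    return np.array(hd5[tm.name])


def mnist_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    # One-hot encode the stored digit, analogous to the removed variant_label_from_hd5.
    one_hot = np.zeros(tm.shape, dtype=np.float32)
    one_hot[int(hd5['mnist_label'][0])] = 1.0
    return one_hot


mnist_image = TensorMap('mnist_image', shape=(28, 28, 1), tensor_from_file=tensor_from_hd5)
mnist_label = TensorMap(
    'mnist_label', Interpretation.CATEGORICAL,
    shape=(10,),
    tensor_from_file=mnist_label_from_hd5,
    channel_map={f'digit_{i}': i for i in range(10)},
)
```

ml4h's argument parsing resolves the `module.map_name` strings on the command line to objects like these; the actual module layout is not shown in this diff.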
4 changes: 3 additions & 1 deletion setup.py
@@ -3,6 +3,7 @@

here = pathlib.Path(__file__).parent.resolve()
# Get the requirements from the requirements file
requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8')
long_description = (here / 'README.md').read_text(encoding='utf-8')
setup(
name='ml4h',
@@ -12,6 +13,7 @@
long_description_content_type='text/markdown',
url='https://github.com/broadinstitute/ml4h',
python_requires='>=3.6',
install_requires=["ml4ht", "tensorflow", "pytest", "numcodecs"], # requirements
#install_requires=["ml4ht", "tensorflow", "pytest", "numcodecs"], # requirements
install_requires=requirements,
packages=find_packages(),
)
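`read_text` hands `install_requires` the requirements file as one newline-separated string; setuptools generally tolerates that, but an explicit list is easier to reason about. A minimal sketch of that variant (not what this PR ships), using the same path as the diff above:

```python
# Sketch: build an explicit list for install_requires instead of passing the
# raw file contents as one string; same path as in the diff above.
import pathlib

here = pathlib.Path(__file__).parent.resolve()
requirements_text = (
    here / 'docker/vm_boot_images/config/tensorflow-requirements.txt'
).read_text(encoding='utf-8')

requirements = [
    line.strip()
    for line in requirements_text.splitlines()
    if line.strip() and not line.strip().startswith('#')  # skip blanks and comments
]

# then inside setup(): install_requires=requirements
```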