
Commit

Merge pull request #2 from NCAR/ggantos
updating data.py for simplicity and z-mass and running 3particle update
djgagne authored Jul 28, 2020
2 parents 4d439ab + e6957f6 commit 0d9e150
Showing 9 changed files with 662 additions and 93 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@
*.DS_Store
**/.ipynb_checkpoints
*.out
batch**.sh
*.o
11 changes: 6 additions & 5 deletions config/1particle.yml
@@ -1,10 +1,12 @@
data_path: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
out_path: "/glade/p/cisl/aiml/ggantos/holodec/conv2d_1particle/"
path_data: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
path_save: "/glade/p/cisl/aiml/ggantos/holodec/xyz3/base1/"
model_name: "cnn"
num_particles: 1
random_seed: 328942
output_cols: ["x", "y", "z", "d"]
input_scaler: "MinMaxScaler"
scaler_out: "MinMaxScaler"
num_z_bins: False
subset: False
metric: "mae"
conv2d_network:
filters: [8, 16, 32]
@@ -16,7 +18,6 @@ conv2d_network:
lr: 0.001
optimizer: "adam"
loss: "mae"
batch_size: 32
batch_size: 256
epochs: 20
verbose: 1
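
For context, a minimal sketch of how the renamed configuration keys might be read by a training script. This commit does not include that script; the PyYAML loading below and the key comments are assumptions based on the diff alone.

# Hypothetical usage, not part of this commit: read the updated 1-particle config.
import yaml

with open("config/1particle.yml") as config_file:
    config = yaml.safe_load(config_file)

path_data = config["path_data"]    # appears to replace the old "data_path" key
path_save = config["path_save"]    # appears to replace the old "out_path" key
scaler_out = config["scaler_out"]  # appears to replace the old "input_scaler" key
subset = config["subset"]          # new key: fraction of the dataset to load (False = load all)
num_z_bins = config["num_z_bins"]  # new key: False disables z-mass binning
print(path_data, path_save, scaler_out, subset, num_z_bins)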

24 changes: 24 additions & 0 deletions config/3particle.yml
@@ -0,0 +1,24 @@
path_data: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
path_save: "/glade/p/cisl/aiml/ggantos/holodec/z3_flatten/base/"
model_name: "cnn"
num_particles: 3
random_seed: 328942
output_cols: ["z","hid"]
scaler_out: "MinMaxScaler"
num_z_bins: False
flatten_coord: True
subset: False
metric: "mae"
conv2d_network:
filters: [8, 16, 32]
kernel_sizes: [5, 5, 5]
conv2d_activation: "relu"
pool_sizes: [4, 4, 4]
dense_sizes: [64, 32]
dense_activation: "relu"
lr: 0.001
optimizer: "adam"
loss: "mae"
batch_size: 256
epochs: 20
verbose: 1
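
The new 3-particle config pairs output_cols ["z", "hid"] with flatten_coord, which matches the new flatten_coordinate path in library/data.py below. A hedged sketch of how this config could drive the new loader; the import path, the MinMaxScaler lookup, and the call itself are assumptions, since the training entry point is not part of this diff.

# Illustrative only: wiring config/3particle.yml into the new data loader.
from sklearn.preprocessing import MinMaxScaler
from library.data import load_scaled_datasets  # import path assumed

train_x, train_y, valid_x, valid_y = load_scaled_datasets(
    path_data="/glade/p/cisl/aiml/ai4ess_hackathon/holodec/",
    num_particles=3,
    output_cols=["z", "hid"],
    scaler_out=MinMaxScaler(),
    subset=False,
    num_z_bins=False,
    flatten_coord=True)
# With flatten_coord=True the "hid" column is used only to group the scaled z
# values, so train_y holds one array of z positions per hologram.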
2 changes: 1 addition & 1 deletion get_data.sh
@@ -1,2 +1,2 @@
#!/bin/bash
wget ftp://ftp.ucar.edu/pub/mmm/bansemer/holodec/synthetic_holograms_v01.nc
wget ftp://ftp.ucar.edu/pub/mmm/bansemer/holodec/project_datasets/synthetic_holograms_v01.nc
221 changes: 193 additions & 28 deletions library/data.py
@@ -2,7 +2,7 @@
import xarray as xr
import numpy as np
import pandas as pd

from datetime import datetime


num_particles_dict = {
@@ -15,51 +15,216 @@
'test' : 'test',
'valid': 'validation'}

def dataset_name(num_particles, split):
"""Return the dataset filename given user inputs"""
def dataset_name(num_particles, split, file_extension='nc'):
"""
Return the dataset filename given user inputs
Args:
num_particles: (int or str) Number of particles per hologram
split: (str) Dataset split of either 'train', 'valid', or 'test'
file_extension: (str) Dataset file extension
Returns:
ds_name: (str) Dataset name
"""

valid = [1,3,'multi']
if num_particles not in valid:
raise ValueError("results: num_particles must be one of %r." % valid)
num_particles = num_particles_dict[num_particles]

valid = ['train','test','valid']
if split not in valid:
raise ValueError("results: split must be one of %r." % valid)
split = split_dict[split]
ds_name = f'synthetic_holograms_{num_particles}_{split}.{file_extension}'

return f'synthetic_holograms_{num_particles}_{split}.nc'
return ds_name

def open_dataset(data_path, num_particles, split):
"""Return xarray dataset given user inputs"""
data_path = os.path.join(data_path, dataset_name(num_particles, split))
ds = xr.open_dataset(data_path)
def open_dataset(path_data, num_particles, split):
"""
Opens a HOLODEC file
Args:
path_data: (str) Path to dataset directory
num_particles: (int or str) Number of particles per hologram
split: (str) Dataset split of either 'train', 'valid', or 'test'
Returns:
ds: (xarray Dataset) Opened dataset
"""
path_data = os.path.join(path_data, dataset_name(num_particles, split))
ds = xr.open_dataset(path_data)
return ds

def scale_images(images):
"""Return images with pixel values between 0 and 1"""
return images.astype(np.float16)/255.
def load_raw_datasets(path_data, num_particles, split, subset, output_cols):
"""
Given a path to a training or validation dataset, the number of particles per
hologram, and output columns, returns raw inputs and outputs. Can specify
a subset of the full dataset.
Args:
path_data: (str) Path to dataset directory
num_particles: (int or str) Number of particles per hologram
split: (str) Dataset split of either 'train', 'valid', or 'test'
subset: (float) Fraction of data to be loaded
output_cols: (list of strings) List of feature columns
Returns:
inputs: (np array) Input image data
outputs: (df) Output data specified by output_cols
"""

ds = open_dataset(path_data, num_particles, split)
if subset:
in_ix = int(subset * ds['image'].shape[0])
out_ix = int(in_ix * (ds['hid'].shape[0]/ds['image'].shape[0]))
inputs = ds['image'][:in_ix].values
outputs = ds[output_cols].sel(particle=slice(0,out_ix)).to_dataframe()
else:
inputs = ds["image"].values
outputs = ds[output_cols].to_dataframe()
ds.close()
return inputs, outputs

def scale_images(images, scaler_in=None):
"""
Takes in array of images and scales pixel values between 0 and 1
Args:
images: (np array) Input image data
scaler_in: (dict) Image scaler 'max' and 'min' values
Returns:
images_scaled: (np array) Input image data scaled between 0 and 1
scaler_in: (dict) Image scaler 'max' and 'min' values
"""

if scaler_in is None:
scaler_in = {}
scaler_in["min"] = images.min()
scaler_in["max"] = images.max()
images_scaled = (images.astype(np.float32) - scaler_in["min"])
images_scaled /= (scaler_in["max"] - scaler_in["min"])

return images_scaled, scaler_in

def load_scaled_datasets(data_path, num_particles, output_cols, input_scaler):
    """Given the dataset particle numbers, returns scaled training and validation xarrays."""
    print("Loading training and validation data")
    xr_train = open_dataset(data_path, num_particles, 'train')
    xr_valid = open_dataset(data_path, num_particles, 'valid')
    print("Scaling output data")
    train_outputs = xr_train[output_cols].to_dataframe()
    valid_outputs = xr_valid[output_cols].to_dataframe()
    scaled_train_outputs = pd.DataFrame(input_scaler.fit_transform(train_outputs),
                                        index=train_outputs.index, columns=train_outputs.columns)
    scaled_valid_outputs = pd.DataFrame(input_scaler.transform(valid_outputs),
                                        index=valid_outputs.index, columns=valid_outputs.columns)
    print("Scaling input data")
    scaled_train_inputs = scale_images(xr_train["image"])
    scaled_valid_inputs = scale_images(xr_valid["image"])
    return scaled_train_inputs, scaled_valid_inputs, scaled_train_outputs, scaled_valid_outputs, input_scaler

def calc_z_relative_mass(outputs, num_z_bins=20, z_bins=None):
    """
    Calculate z-relative mass from particle data.
    Args:
        outputs: (df) Output data specified by output_col
        num_z_bins: (int) Number of bins for z-axis linspace
        z_bins: (np array) Bin linspace along the z-axis
    Returns:
        z_mass: (np array) Particle mass distribution by hologram along z-axis
        z_bins: (np array) Bin linspace along the z-axis
    """

    if z_bins is None:
        z_bins = np.linspace(outputs["z"].min() - 100,
                             outputs["z"].max() + 100,
                             num_z_bins)
    else:
        num_z_bins = z_bins.size
    holograms = len(outputs["hid"].unique())
    z_mass = np.zeros((holograms, num_z_bins), dtype=np.float32)
    for i in range(outputs.shape[0]):
        z_pos = np.searchsorted(z_bins, outputs.loc[i, "z"], side="right") - 1
        mass = 4 / 3 * np.pi * (outputs.loc[i, "d"]/2)**3
        z_mass[int(outputs.loc[i, "hid"]) - 1, z_pos] += mass
    z_mass /= np.expand_dims(z_mass.sum(axis=1), -1)
    return z_mass, z_bins

def calc_z_bins(train_outputs, valid_outputs, num_z_bins):
    """
    Calculate z-axis linspace.
    Args:
        train_outputs: (df) Training output data
        valid_outputs: (df) Validation output data
        num_z_bins: (int) Number of bins along the z-axis
    Returns:
        z_bins: (np array) Bin linspace along the z-axis
    """
    z_min = np.minimum(train_outputs["z"].min(), valid_outputs["z"].min())
    z_max = np.maximum(train_outputs["z"].max(), valid_outputs["z"].max())
    z_bins = np.linspace(z_min, z_max, num_z_bins)
    return z_bins

def flatten_coordinate(outputs, hids, output_col):
    """Group a single output column into one array of values per hologram id."""
    outputs = pd.DataFrame({output_col: outputs, 'hid': hids})
    outputs_flattened = []
    for h in np.unique(hids):
        outputs_flattened.append(outputs[outputs.hid == h][output_col].values)
    outputs = np.array(outputs_flattened, dtype=object)
    return outputs

def load_scaled_datasets(path_data, num_particles, output_cols,
scaler_out=False, subset=False, num_z_bins=False,
flatten_coord=False):
"""
Given a path to a training or validation dataset, the number of particles per
hologram, and output columns, returns scaled inputs and scaled outputs.
Args:
path_data: (str) Path to dataset directory
num_particles: (int or str) Number of particles per hologram
output_cols: (list of strings) List of feature columns
scaler_out: (sklearn.preprocessing scaler) Output data scaler
subset: (float) Fraction of data to be loaded
num_z_bins: (int) Number of bins along z-axis
flatten_coord: (boolean) If True, flatten single coord by hid
Returns:
train_inputs: (np array) Train input data scaled between 0 and 1
train_outputs: (np array) Scaled train output data
valid_inputs: (np array) Valid input data scaled between 0 and 1
valid_outputs: (np array) Scaled valid output data
"""

train_inputs,\
train_outputs = load_raw_datasets(path_data, num_particles, 'train',
subset, output_cols)
valid_inputs,\
valid_outputs = load_raw_datasets(path_data, num_particles, 'valid',
subset, output_cols)

if flatten_coord:
train_hids = train_outputs["hid"].values
valid_hids = valid_outputs["hid"].values
train_outputs = train_outputs.drop(['hid'], axis=1)
valid_outputs = valid_outputs.drop(['hid'], axis=1)

train_inputs, scaler_in = scale_images(train_inputs)
valid_inputs, _ = scale_images(valid_inputs, scaler_in)
train_inputs = np.expand_dims(train_inputs, -1)
valid_inputs = np.expand_dims(valid_inputs, -1)

if num_z_bins:
z_bins = calc_z_bins(train_outputs, valid_outputs, num_z_bins)
train_outputs, _ = calc_z_relative_mass(outputs=train_outputs,
z_bins=z_bins)
valid_outputs, _ = calc_z_relative_mass(outputs=valid_outputs,
z_bins=z_bins)
else:
train_outputs = scaler_out.fit_transform(train_outputs)
valid_outputs = scaler_out.transform(valid_outputs)

if flatten_coord:
output_cols.remove("hid")
train_outputs = flatten_coordinate(train_outputs.flatten(),
train_hids, output_cols[0])
valid_outputs = flatten_coordinate(valid_outputs.flatten(),
valid_hids, output_cols[0])

if train_inputs.shape[0] != train_outputs.shape[0]:
factor = int(train_outputs.shape[0]/train_inputs.shape[0])
train_inputs = np.repeat(train_inputs, factor, axis=0)
factor = int(valid_outputs.shape[0]/valid_inputs.shape[0])
valid_inputs = np.repeat(valid_inputs, factor, axis=0)

return train_inputs, train_outputs, valid_inputs, valid_outputs
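
To sanity-check the new z-mass helper, here is a tiny, made-up example; the values are illustrative rather than taken from the HOLODEC dataset, and the import path is assumed.

# Toy check of calc_z_relative_mass on three particles spread over two holograms.
import numpy as np
import pandas as pd
from library.data import calc_z_relative_mass  # import path assumed

toy = pd.DataFrame({
    "hid": [1, 1, 2],           # hologram ids
    "z":   [10.0, 55.0, 30.0],  # particle depth along the optical axis
    "d":   [2.0, 4.0, 2.0],     # particle diameter
})
z_mass, z_bins = calc_z_relative_mass(toy, num_z_bins=5)
# Each particle adds 4/3 * pi * (d/2)**3 of volume-proportional "mass" to the
# z bin containing it, and each hologram's distribution is normalised to sum to 1.
print(z_mass.shape)         # (2, 5)
print(z_mass.sum(axis=1))   # approximately [1. 1.]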