updating data.py for simplicity and z-mass and running 3particle update #2

Merged: 9 commits on Jul 28, 2020
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@
*.DS_Store
**/.ipynb_checkpoints
*.out
batch**.sh
Collaborator:
Why should the batch**.sh scripts be ignored by git? Are they autogenerated by another program?

Collaborator Author:
Not autogenerated, but since they're specific to our Casper environment, I figured we don't necessarily want them in the git repo. What do you think?

*.o
11 changes: 6 additions & 5 deletions config/1particle.yml
@@ -1,10 +1,12 @@
data_path: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
out_path: "/glade/p/cisl/aiml/ggantos/holodec/conv2d_1particle/"
path_data: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
path_save: "/glade/p/cisl/aiml/ggantos/holodec/xyz3/base1/"
model_name: "cnn"
num_particles: 1
random_seed: 328942
output_cols: ["x", "y", "z", "d"]
input_scaler: "MinMaxScaler"
scaler_out: "MinMaxScaler"
Collaborator:
Does scaler_out handle the scaling of both the inputs and outputs or just the outputs?

Collaborator:
Based on further review of the code, it looks like it handles the outputs, which is fine.

Collaborator Author:
Sorry, yes, just the outputs. In my latest commit I renamed scaler_vals to scaler_in to match the naming convention.

num_z_bins: False
subset: False
metric: "mae"
conv2d_network:
filters: [8, 16, 32]
@@ -16,7 +18,6 @@ conv2d_network:
lr: 0.001
optimizer: "adam"
loss: "mae"
batch_size: 32
batch_size: 256
epochs: 20
verbose: 1
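
To make the scaler_out behavior discussed above concrete: in the new data.py it is fit on the training output columns only and reused to transform the validation outputs, while the image inputs are scaled separately by scale_images. A minimal sketch, assuming scikit-learn's MinMaxScaler and small made-up DataFrames in place of the real HOLODEC outputs:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Stand-in output frames; the real ones come from load_raw_datasets.
train_outputs = pd.DataFrame({"z": [14000.0, 22000.0, 31000.0], "d": [20.0, 35.0, 50.0]})
valid_outputs = pd.DataFrame({"z": [18000.0, 27000.0], "d": [25.0, 40.0]})

scaler_out = MinMaxScaler()
train_scaled = scaler_out.fit_transform(train_outputs)  # fit on training outputs only
valid_scaled = scaler_out.transform(valid_outputs)      # reuse the same fit for validation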

24 changes: 24 additions & 0 deletions config/3particle.yml
@@ -0,0 +1,24 @@
path_data: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
path_save: "/glade/p/cisl/aiml/ggantos/holodec/z3_flatten/base/"
model_name: "cnn"
num_particles: 3
random_seed: 328942
output_cols: ["z","hid"]
scaler_out: "MinMaxScaler"
num_z_bins: False
flatten_coord: True
subset: False
metric: "mae"
conv2d_network:
filters: [8, 16, 32]
kernel_sizes: [5, 5, 5]
conv2d_activation: "relu"
pool_sizes: [4, 4, 4]
dense_sizes: [64, 32]
dense_activation: "relu"
lr: 0.001
optimizer: "adam"
loss: "mae"
batch_size: 256
epochs: 20
verbose: 1
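
Presumably the training script loads a config like this and turns the scaler_out name into a scikit-learn scaler before calling into data.py. A rough sketch of that step, assuming PyYAML and scikit-learn; the filename and the getattr lookup are illustrative rather than the repository's actual train script:

import yaml
from sklearn import preprocessing

with open("config/3particle.yml") as f:
    config = yaml.safe_load(f)

# Map the scaler name from the config, e.g. "MinMaxScaler", to a scaler instance.
scaler_out = getattr(preprocessing, config["scaler_out"])()
num_particles = config["num_particles"]  # 3
output_cols = config["output_cols"]      # ["z", "hid"]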
2 changes: 1 addition & 1 deletion get_data.sh
@@ -1,2 +1,2 @@
#!/bin/bash
wget ftp://ftp.ucar.edu/pub/mmm/bansemer/holodec/synthetic_holograms_v01.nc
wget ftp://ftp.ucar.edu/pub/mmm/bansemer/holodec/project_datasets/synthetic_holograms_v01.nc
221 changes: 193 additions & 28 deletions library/data.py
@@ -2,7 +2,7 @@
import xarray as xr
import numpy as np
import pandas as pd

from datetime import datetime


num_particles_dict = {
@@ -15,51 +15,216 @@
'test' : 'test',
'valid': 'validation'}

def dataset_name(num_particles, split):
"""Return the dataset filename given user inputs"""
def dataset_name(num_particles, split, file_extension='nc'):
"""
Return the dataset filename given user inputs

Args:
num_particles: (int or str) Number of particles per hologram
split: (str) Dataset split of either 'train', 'valid', or 'test'
file_extension: (str) Dataset file extension

Returns:
ds_name: (str) Dataset name
"""

valid = [1,3,'multi']
if num_particles not in valid:
raise ValueError("results: num_particles must be one of %r." % valid)
num_particles = num_particles_dict[num_particles]

valid = ['train','test','valid']
if split not in valid:
raise ValueError("results: split must be one of %r." % valid)
split = split_dict[split]
ds_name = f'synthetic_holograms_{num_particles}_{split}.{file_extension}'

return f'synthetic_holograms_{num_particles}_{split}.nc'
return ds_name

def open_dataset(data_path, num_particles, split):
"""Return xarray dataset given user inputs"""
data_path = os.path.join(data_path, dataset_name(num_particles, split))
ds = xr.open_dataset(data_path)
def open_dataset(path_data, num_particles, split):
"""
Opens a HOLODEC file

Args:
path_data: (str) Path to dataset directory
num_particles: (int or str) Number of particles per hologram
split: (str) Dataset split of either 'train', 'valid', or 'test'

Returns:
ds: (xarray Dataset) Opened dataset
"""
path_data = os.path.join(path_data, dataset_name(num_particles, split))
ds = xr.open_dataset(path_data)
return ds

def scale_images(images):
"""Return images with pixel values between 0 and 1"""
return images.astype(np.float16)/255.
def load_raw_datasets(path_data, num_particles, split, subset, output_cols):
"""
Given a path to a training or validation dataset, the number of particles per
hologram, and output columns, returns raw inputs and outputs. Can specify
a subset of the full dataset.

Args:
path_data: (str) Path to dataset directory
num_particles: (int or str) Number of particles per hologram
split: (str) Dataset split of either 'train', 'valid', or 'test'
subset: (float) Fraction of data to be loaded
output_cols: (list of strings) List of output columns

Returns:
inputs: (np array) Input image data
outputs: (df) Output data specified by output_cols
"""

ds = open_dataset(path_data, num_particles, split)
if subset:
in_ix = int(subset * ds['image'].shape[0])
out_ix = int(in_ix * (ds['hid'].shape[0]/ds['image'].shape[0]))
inputs = ds['image'][:in_ix].values
outputs = ds[output_cols].sel(particle=slice(0,out_ix)).to_dataframe()
else:
inputs = ds["image"].values
outputs = ds[output_cols].to_dataframe()
ds.close()
return inputs, outputs

def scale_images(images, scaler_in=None):
"""
Takes in array of images and scales pixel values between 0 and 1

Args:
images: (np array) Input image data
scaler_in: (dict) Image scaler 'max' and 'min' values

Returns:
images_scaled: (np array) Input image data scaled between 0 and 1
scaler_in: (dict) Image scaler 'max' and 'min' values
"""

if scaler_in is None:
scaler_in = {}
scaler_in["min"] = images.min()
scaler_in["max"] = images.max()
images_scaled = (images.astype(np.float32) - scaler_in["min"])
images_scaled /= (scaler_in["max"] - scaler_in["min"])

return images_scaled, scaler_in

def load_scaled_datasets(data_path, num_particles, output_cols, input_scaler):
"""Given the dataset particle numbers, returns scaled training and validation xarrays."""
def calc_z_relative_mass(outputs, num_z_bins=20, z_bins=None):
"""
Calculate z-relative mass from particle data.

print("Loading training and validation data")
xr_train = open_dataset(data_path, num_particles, 'train')
xr_valid = open_dataset(data_path, num_particles, 'valid')
Args:
outputs: (df) Output data specified by output_col
num_z_bins: (int) Number of bins for z-axis linspace
z_bins: (np array) Bin linspace along the z-axis

print("Scaling output data")
train_outputs = xr_train[output_cols].to_dataframe()
valid_outputs = xr_valid[output_cols].to_dataframe()
Returns:
z_mass: (np array) Particle mass distribution by hologram along z-axis
z_bins: (np array) Bin linspace along the z-axis
"""

scaled_train_outputs = pd.DataFrame(input_scaler.fit_transform(train_outputs),
index=train_outputs.index, columns=train_outputs.columns)
if z_bins is None:
z_bins = np.linspace(outputs["z"].min() - 100,
outputs["z"].max() + 100,
num_z_bins)
else:
num_z_bins = z_bins.size
holograms = len(outputs["hid"].unique())
z_mass = np.zeros((holograms, num_z_bins), dtype=np.float32)
for i in range(outputs.shape[0]):
z_pos = np.searchsorted(z_bins, outputs.loc[i, "z"], side="right") - 1
mass = 4 / 3 * np.pi * (outputs.loc[i, "d"]/2)**3
z_mass[int(outputs.loc[i, "hid"]) - 1, z_pos] += mass
z_mass /= np.expand_dims(z_mass.sum(axis=1), -1)
return z_mass, z_bins

def calc_z_bins(train_outputs, valid_outputs, num_z_bins):
"""
Calculate z-axis linspace.

scaled_valid_outputs = pd.DataFrame(input_scaler.transform(valid_outputs),
index=valid_outputs.index, columns=valid_outputs.columns)
Args:
train_outputs: (df) Training output data
valid_outputs: (df) Validation output data
num_z_bins: (int) Number of bins along the z-axis

print("Scaling input data")
scaled_train_inputs = scale_images(xr_train["image"])
scaled_valid_inputs = scale_images(xr_valid["image"])
Returns:
z_bins: (np array) Bin linspace along the z-axis
"""
z_min = np.minimum(train_outputs["z"].min(), valid_outputs["z"].min())
z_max = np.maximum(train_outputs["z"].max(), valid_outputs["z"].max())
z_bins = np.linspace(z_min, z_max, num_z_bins)
return z_bins

def flatten_coordinate(outputs, hids, output_col):
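"""
Group values of a single output column by hologram ID.

Args:
outputs: (np array) Flattened values of a single output column
hids: (np array) Hologram ID for each entry in outputs
output_col: (str) Name of the output column being grouped

Returns:
outputs: (np array of objects) One array of output_col values per hologram
"""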
outputs = pd.DataFrame({output_col: outputs, 'hid': hids})
outputs_flattened = []
for h in np.unique(hids):
outputs_flattened.append(outputs[outputs.hid == h][output_col].values)
outputs = np.array(outputs_flattened, dtype=object)
return outputs

return scaled_train_inputs, scaled_valid_inputs, scaled_train_outputs, scaled_valid_outputs, input_scaler
def load_scaled_datasets(path_data, num_particles, output_cols,
scaler_out=False, subset=False, num_z_bins=False,
flatten_coord=False):
"""
Given a path to a training or validation dataset, the number of particles per
hologram, and output columns, returns scaled inputs and scaled outputs.

Args:
path_data: (str) Path to dataset directory
num_particles: (int or str) Number of particles per hologram
output_cols: (list of strings) List of output columns
scaler_out: (sklearn.preprocessing scaler) Output data scaler
subset: (float) Fraction of data to be loaded
num_z_bins: (int) Number of bins along z-axis
flatten_coord: (boolean) If True, flatten single coord by hid

Returns:
train_inputs: (np array) Train input data scaled between 0 and 1
train_outputs: (np array) Scaled train output data
valid_inputs: (np array) Valid input data scaled between 0 and 1
valid_outputs: (np array) Scaled valid output data
"""

train_inputs,\
train_outputs = load_raw_datasets(path_data, num_particles, 'train',
subset, output_cols)
valid_inputs,\
valid_outputs = load_raw_datasets(path_data, num_particles, 'valid',
subset, output_cols)

if flatten_coord:
train_hids = train_outputs["hid"].values
valid_hids = valid_outputs["hid"].values
train_outputs = train_outputs.drop(['hid'], axis=1)
valid_outputs = valid_outputs.drop(['hid'], axis=1)

train_inputs, scaler_in = scale_images(train_inputs)
valid_inputs, _ = scale_images(valid_inputs, scaler_in)
train_inputs = np.expand_dims(train_inputs, -1)
valid_inputs = np.expand_dims(valid_inputs, -1)

if num_z_bins:
z_bins = calc_z_bins(train_outputs, valid_outputs, num_z_bins)
train_outputs, _ = calc_z_relative_mass(outputs=train_outputs,
z_bins=z_bins)
valid_outputs, _ = calc_z_relative_mass(outputs=valid_outputs,
z_bins=z_bins)
else:
train_outputs = scaler_out.fit_transform(train_outputs)
valid_outputs = scaler_out.transform(valid_outputs)

if flatten_coord:
output_cols.remove("hid")
train_outputs = flatten_coordinate(train_outputs.flatten(),
train_hids, output_cols[0])
valid_outputs = flatten_coordinate(valid_outputs.flatten(),
valid_hids, output_cols[0])

if train_inputs.shape[0] != train_outputs.shape[0]:
factor = int(train_outputs.shape[0]/train_inputs.shape[0])
train_inputs = np.repeat(train_inputs, factor, axis=0)
factor = int(valid_outputs.shape[0]/valid_inputs.shape[0])
valid_inputs = np.repeat(valid_inputs, factor, axis=0)

return train_inputs, train_outputs, valid_inputs, valid_outputs
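
For reference, a minimal usage sketch of the reworked loader, with values mirroring 3particle.yml; the module path library.data and the surrounding setup are assumptions rather than the repository's actual training entry point:

from sklearn.preprocessing import MinMaxScaler
from library.data import load_scaled_datasets

path_data = "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
train_in, train_out, valid_in, valid_out = load_scaled_datasets(
    path_data=path_data,
    num_particles=3,
    output_cols=["z", "hid"],
    scaler_out=MinMaxScaler(),
    subset=False,
    num_z_bins=False,
    flatten_coord=True,
)
# train_in / valid_in are image arrays scaled to [0, 1] with a trailing channel axis;
# with flatten_coord=True, train_out / valid_out hold one array of z values per hologram.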