# Copyright 2016-2023 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/deepcell-tf/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions for making training data"""
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from keras.utils import conv_utils
from deepcell.utils.tracking_utils import load_trks
def get_data(file_name, mode='sample', test_size=.2, seed=0):
    """Load data from NPZ file and split into train and test sets.

    Args:
        file_name (str): path to NPZ file to load (or a ``.trk`` file when
            ``mode`` is ``'siamese_daughters'``)
        mode (str): if 'siamese_daughters', returns lineage information from
            .trk file otherwise, returns the same data that was loaded.
        test_size (float): fraction of data to leave as testing holdout
        seed (int): seed number for random train/test split repeatability

    Returns:
        (dict, dict): dict of training data, and a dict of testing data
    """
    # siamese_daughters mode is used to import lineage data
    # and associate it with the appropriate batch
    if mode == 'siamese_daughters':
        training_data = load_trks(file_name)
        X = training_data['X']
        y = training_data['y']
        # `daughters` is of the form:
        #
        #     2 children / cell (potentially empty)
        #          ___________|__________
        #         /                      \
        # daughters = [{id_1: [daughter_1, daughter_2], ...}, ]
        #              \___________________________________/
        #                               |
        #                 dict of (cell_id -> children)
        #
        # each batch has a separate (cell_id -> children) dict
        daughters = [{cell: fields['daughters']
                      for cell, fields in tracks.items()}
                     for tracks in training_data['lineages']]

        # split lineages alongside X/y so each split keeps matching batches
        X_train, X_test, y_train, y_test, ln_train, ln_test = train_test_split(
            X, y, daughters, test_size=test_size, random_state=seed)

        train_dict = {
            'X': X_train,
            'y': y_train,
            'daughters': ln_train
        }

        test_dict = {
            'X': X_test,
            'y': y_test,
            'daughters': ln_test
        }
        return train_dict, test_dict

    training_data = np.load(file_name)
    X = training_data['X']
    y = training_data['y']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)

    train_dict = {
        'X': X_train,
        'y': y_train
    }

    test_dict = {
        'X': X_test,
        'y': y_test
    }

    return train_dict, test_dict
def get_max_sample_num_list(y, edge_feature, output_mode='sample', padding='valid',
                            window_size_x=30, window_size_y=30):
    """For each set of images and each feature, find the maximum number
    of samples to be used. This will be used to balance class sampling.

    Args:
        y (numpy.array): mask to indicate which pixels belong to which class
        edge_feature (list): [1, 0, 0], the 1 indicates the feature
            is the cell edge
        output_mode (str): 'sample' or 'conv'
        padding (str): 'valid' or 'same'
        window_size_x (int): number of row pixels trimmed from either side
            when ``padding`` is 'valid'
        window_size_y (int): number of column pixels trimmed from either side
            when ``padding`` is 'valid'

    Returns:
        list: list of maximum sample size for all classes
    """
    list_of_max_sample_numbers = []

    if padding == 'valid':
        y = trim_padding(y, window_size_x, window_size_y)

    # for each set of images
    for j in range(y.shape[0]):
        if output_mode == 'sample':
            for k, edge_feat in enumerate(edge_feature):
                if edge_feat == 1:
                    if K.image_data_format() == 'channels_first':
                        y_sum = np.sum(y[j, k, :, :])
                    else:
                        y_sum = np.sum(y[j, :, :, k])
                    list_of_max_sample_numbers.append(y_sum)
        else:
            # non-sample modes are unbounded; np.Inf was removed in
            # NumPy 2.0, so use the lowercase np.inf constant
            list_of_max_sample_numbers.append(np.inf)

    return list_of_max_sample_numbers
def sample_label_matrix(y, window_size=(30, 30), padding='valid',
                        max_training_examples=1e7, data_format=None):
    """Sample a 4D Tensor, creating many small images of shape window_size.

    Args:
        y (numpy.array): label masks with the same shape as X data
        window_size (tuple): size of window around each pixel to sample
        padding (str): padding type 'valid' or 'same'
        max_training_examples (int): max number of samples per class
        data_format (str): 'channels_first' or 'channels_last'

    Returns:
        tuple: 4 arrays of coordinates of each sampled pixel
    """
    data_format = conv_utils.normalize_data_format(data_format)
    is_channels_first = data_format == 'channels_first'
    if is_channels_first:
        num_dirs, num_features, image_size_x, image_size_y = y.shape
    else:
        num_dirs, image_size_x, image_size_y, num_features = y.shape

    window_size = conv_utils.normalize_tuple(window_size, 2, 'window_size')
    window_size_x, window_size_y = window_size

    feature_rows, feature_cols, feature_batch, feature_label = [], [], [], []

    for direc in range(num_dirs):
        for k in range(num_features):
            if is_channels_first:
                feature_rows_temp, feature_cols_temp = np.where(y[direc, k, :, :] == 1)
            else:
                feature_rows_temp, feature_cols_temp = np.where(y[direc, :, :, k] == 1)

            # Check to make sure the features are actually present
            if feature_rows_temp.size == 0:
                continue

            # Randomly permute index vector
            non_rand_ind = np.arange(len(feature_rows_temp))
            rand_ind = np.random.choice(non_rand_ind, size=len(feature_rows_temp), replace=False)

            for i in rand_ind:
                # with 'valid' padding, only keep pixels whose sampling
                # window fits entirely inside the image
                condition = padding == 'valid' and \
                    feature_rows_temp[i] - window_size_x > 0 and \
                    feature_rows_temp[i] + window_size_x < image_size_x and \
                    feature_cols_temp[i] - window_size_y > 0 and \
                    feature_cols_temp[i] + window_size_y < image_size_y

                if padding == 'same' or condition:
                    feature_rows.append(feature_rows_temp[i])
                    feature_cols.append(feature_cols_temp[i])
                    feature_batch.append(direc)
                    feature_label.append(k)

    # Randomize and subsample down to at most max_training_examples
    non_rand_ind = np.arange(len(feature_rows), dtype='int32')
    if not max_training_examples:
        max_training_examples = non_rand_ind.size
    else:
        max_training_examples = int(max_training_examples)

    limit = min(non_rand_ind.size, max_training_examples)
    rand_ind = np.random.choice(non_rand_ind, size=limit, replace=False)

    feature_rows = np.array(feature_rows, dtype='int32')[rand_ind]
    feature_cols = np.array(feature_cols, dtype='int32')[rand_ind]
    feature_batch = np.array(feature_batch, dtype='int32')[rand_ind]
    feature_label = np.array(feature_label, dtype='int32')[rand_ind]

    return feature_rows, feature_cols, feature_batch, feature_label
def sample_label_movie(y, window_size=(30, 30, 5), padding='valid',
                       max_training_examples=1e7, data_format=None):
    """Sample a 5D Tensor, creating many small voxels of shape window_size.

    Args:
        y (numpy.array): label masks with the same shape as X data
        window_size (tuple): size of window around each pixel to sample
        padding (str): padding type 'valid' or 'same'
        max_training_examples (int): max number of samples per class
        data_format (str): 'channels_first' or 'channels_last'

    Returns:
        tuple: 5 arrays of coordinates of each sampled pixel
    """
    data_format = conv_utils.normalize_data_format(data_format)
    is_channels_first = data_format == 'channels_first'
    if is_channels_first:
        num_dirs, num_features, image_size_z, image_size_x, image_size_y = y.shape
    else:
        num_dirs, image_size_z, image_size_x, image_size_y, num_features = y.shape

    window_size = conv_utils.normalize_tuple(window_size, 3, 'window_size')
    window_size_x, window_size_y, window_size_z = window_size

    feature_rows, feature_cols, feature_frames, feature_batch, feature_label = [], [], [], [], []

    for d in range(num_dirs):
        for k in range(num_features):
            if is_channels_first:
                frames_temp, rows_temp, cols_temp = np.where(y[d, k, :, :, :] == 1)
            else:
                frames_temp, rows_temp, cols_temp = np.where(y[d, :, :, :, k] == 1)

            # Check to make sure the features are actually present
            if rows_temp.size == 0:
                continue

            # Randomly permute index vector
            non_rand_ind = np.arange(len(rows_temp))
            rand_ind = np.random.choice(non_rand_ind, size=len(rows_temp), replace=False)

            for i in rand_ind:
                # with 'valid' padding, only keep voxels whose sampling
                # window fits entirely inside the movie volume
                condition = padding == 'valid' and \
                    frames_temp[i] - window_size_z > 0 and \
                    frames_temp[i] + window_size_z < image_size_z and \
                    rows_temp[i] - window_size_x > 0 and \
                    rows_temp[i] + window_size_x < image_size_x and \
                    cols_temp[i] - window_size_y > 0 and \
                    cols_temp[i] + window_size_y < image_size_y

                if padding == 'same' or condition:
                    feature_rows.append(rows_temp[i])
                    feature_cols.append(cols_temp[i])
                    feature_frames.append(frames_temp[i])
                    feature_batch.append(d)
                    feature_label.append(k)

    # Randomize and subsample down to at most max_training_examples
    non_rand_ind = np.arange(len(feature_rows), dtype='int32')
    if not max_training_examples:
        max_training_examples = non_rand_ind.size
    else:
        max_training_examples = int(max_training_examples)

    limit = min(non_rand_ind.size, max_training_examples)
    rand_ind = np.random.choice(non_rand_ind, size=limit, replace=False)

    feature_frames = np.array(feature_frames, dtype='int32')[rand_ind]
    feature_rows = np.array(feature_rows, dtype='int32')[rand_ind]
    feature_cols = np.array(feature_cols, dtype='int32')[rand_ind]
    feature_batch = np.array(feature_batch, dtype='int32')[rand_ind]
    feature_label = np.array(feature_label, dtype='int32')[rand_ind]

    return feature_frames, feature_rows, feature_cols, feature_batch, feature_label
def trim_padding(nparr, win_x, win_y, win_z=None):
    """Trim the boundaries of the numpy array to allow for a sliding
    window of size (win_x, win_y) to not slide over regions without pixel data.

    Args:
        nparr (numpy.array): numpy array to trim
        win_x (int): number of row pixels to ignore on either side
        win_y (int): number of column pixels to ignore on either side
        win_z (int): number of frames to ignore on either side
            (only used for 5D input)

    Returns:
        numpy.array: trimmed numpy array of size
            ``x - 2 * win_x, y - 2 * win_y``

    Raises:
        ValueError: nparr.ndim is not 4 or 5
    """
    is_channels_first = K.image_data_format() == 'channels_first'
    if nparr.ndim == 4:
        if is_channels_first:
            trimmed = nparr[:, :, win_x:-win_x, win_y:-win_y]
        else:
            trimmed = nparr[:, win_x:-win_x, win_y:-win_y, :]
    elif nparr.ndim == 5:
        if is_channels_first:
            if win_z:
                win_z = int(win_z)
                trimmed = nparr[:, :, win_z:-win_z, win_x:-win_x, win_y:-win_y]
            else:
                trimmed = nparr[:, :, :, win_x:-win_x, win_y:-win_y]
        else:
            if win_z:
                win_z = int(win_z)
                trimmed = nparr[:, win_z:-win_z, win_x:-win_x, win_y:-win_y, :]
            else:
                trimmed = nparr[:, :, win_x:-win_x, win_y:-win_y, :]
    else:
        raise ValueError('Expected to trim numpy array of ndim 4 or 5, '
                         f'got "{nparr.ndim}"')
    return trimmed
def reshape_matrix(X, y, reshape_size=256):
    """
    Reshape matrix of dimension 4 to have x and y of size reshape_size.
    Adds overlapping slices to batches.
    E.g. ``reshape_size`` of 256 yields
    (1, 1024, 1024, 1) -> (16, 256, 256, 1)
    The input image is divided into subimages of side length reshape_size,
    with the last row and column of subimages overlapping the one before the
    last if the original image side lengths are not divisible by
    ``reshape_size``.

    Args:
        X (numpy.array): raw 4D image tensor
        y (numpy.array): label mask of 4D image data
        reshape_size (int, list): size of the output tensor
            If input is int, output images are square with side length equal
            reshape_size. If it is a list of 2 ints, then the output images
            size is reshape_size[0] x reshape_size[1]

    Returns:
        numpy.array: reshaped ``X`` and ``y`` 4D tensors
            in ``shape[1:3] = (reshape_size, reshape_size)``,
            if ``reshape_size`` is an ``int``,
            and ``shape[1:3] = reshape_size``,
            if ``reshape_size`` is a list of length 2

    Raises:
        ValueError: ``X.ndim`` is not 4
        ValueError: ``y.ndim`` is not 4
    """
    is_channels_first = K.image_data_format() == 'channels_first'
    if X.ndim != 4:
        raise ValueError('reshape_matrix expects X dim to be 4, got', X.ndim)
    elif y.ndim != 4:
        raise ValueError('reshape_matrix expects y dim to be 4, got', y.ndim)

    if isinstance(reshape_size, int):
        reshape_size_x = reshape_size_y = reshape_size
    elif len(reshape_size) == 2 and all(isinstance(x, int) for x in reshape_size):
        reshape_size_x, reshape_size_y = reshape_size
    else:
        raise ValueError('reshape_size must be an integer or an iterable containing 2 integers.')

    image_size_x, image_size_y = X.shape[2:] if is_channels_first else X.shape[1:3]
    # number of tiles along each spatial axis (last tile may overlap)
    rep_number_x = np.int_(np.ceil(image_size_x / reshape_size_x))
    rep_number_y = np.int_(np.ceil(image_size_y / reshape_size_y))
    new_batch_size = X.shape[0] * rep_number_x * rep_number_y

    if is_channels_first:
        new_X_shape = (new_batch_size, X.shape[1], reshape_size_x, reshape_size_y)
        new_y_shape = (new_batch_size, y.shape[1], reshape_size_x, reshape_size_y)
    else:
        new_X_shape = (new_batch_size, reshape_size_x, reshape_size_y, X.shape[3])
        new_y_shape = (new_batch_size, reshape_size_x, reshape_size_y, y.shape[3])

    new_X = np.zeros(new_X_shape, dtype=K.floatx())
    new_y = np.zeros(new_y_shape, dtype='int32')

    counter = 0
    for b in range(X.shape[0]):
        for i in range(rep_number_x):
            for j in range(rep_number_y):
                _axis = 2 if is_channels_first else 1
                if i != rep_number_x - 1:
                    x_start, x_end = i * reshape_size_x, (i + 1) * reshape_size_x
                else:
                    # final tile is anchored to the image edge so it may
                    # overlap the previous tile instead of running off the end
                    x_start, x_end = -reshape_size_x, X.shape[_axis]

                if j != rep_number_y - 1:
                    y_start, y_end = j * reshape_size_y, (j + 1) * reshape_size_y
                else:
                    y_start, y_end = -reshape_size_y, y.shape[_axis + 1]

                if is_channels_first:
                    new_X[counter] = X[b, :, x_start:x_end, y_start:y_end]
                    new_y[counter] = y[b, :, x_start:x_end, y_start:y_end]
                else:
                    new_X[counter] = X[b, x_start:x_end, y_start:y_end, :]
                    new_y[counter] = y[b, x_start:x_end, y_start:y_end, :]

                # make instance labels sequential within each tile
                new_y[counter] = relabel_movie(new_y[counter])
                counter += 1

    print(f'Reshaped feature data from {y.shape} to {new_y.shape}')
    print(f'Reshaped training data from {X.shape} to {new_X.shape}')
    return new_X, new_y
def relabel_movie(y):
    """Relabels unique instance IDs to be from 1 to N.

    Args:
        y (numpy.array): tensor of integer labels

    Returns:
        numpy.array: relabeled tensor with sequential labels
    """
    new_y = np.zeros(y.shape)
    unique_cells = np.unique(y)  # get all unique values of y
    # Remove the background label (0) explicitly. The previous
    # implementation deleted the *first* unique value, which silently
    # dropped the smallest cell ID whenever no background pixel was present.
    unique_cells = unique_cells[unique_cells != 0]
    relabel_ids = np.arange(1, len(unique_cells) + 1)
    for cell_id, relabel_id in zip(unique_cells, relabel_ids):
        cell_loc = np.where(y == cell_id)
        new_y[cell_loc] = relabel_id
    return new_y
def reshape_movie(X, y, reshape_size=256):
    """
    Reshape tensor of dimension 5 to have x and y of size ``reshape_size``.
    Adds overlapping slices to batches.
    E.g. ``reshape_size`` of 256 yields
    ``(1, 5, 1024, 1024, 1) -> (16, 5, 256, 256, 1)``

    Args:
        X (numpy.array): raw 5D image tensor
        y (numpy.array): label mask of 5D image tensor
        reshape_size (int): size of the square output tensor

    Returns:
        numpy.array: reshaped ``X`` and ``y`` tensors in shape
            ``(reshape_size, reshape_size)``

    Raises:
        ValueError: ``X.ndim`` is not 5
        ValueError: ``y.ndim`` is not 5
    """
    is_channels_first = K.image_data_format() == 'channels_first'
    if X.ndim != 5:
        raise ValueError(f'reshape_movie expects X dim to be 5, got {X.ndim}')
    elif y.ndim != 5:
        raise ValueError(f'reshape_movie expects y dim to be 5, got {y.ndim}')

    image_size_x, image_size_y = X.shape[3:] if is_channels_first else X.shape[2:4]
    # number of tiles along each spatial axis (last tile may overlap)
    rep_number = np.int_(np.ceil(image_size_x / reshape_size))
    new_batch_size = X.shape[0] * (rep_number) ** 2

    if is_channels_first:
        new_X_shape = (new_batch_size, X.shape[1], X.shape[2], reshape_size, reshape_size)
        new_y_shape = (new_batch_size, y.shape[1], y.shape[2], reshape_size, reshape_size)
    else:
        new_X_shape = (new_batch_size, X.shape[1], reshape_size, reshape_size, X.shape[4])
        new_y_shape = (new_batch_size, y.shape[1], reshape_size, reshape_size, y.shape[4])

    new_X = np.zeros(new_X_shape, dtype=K.floatx())
    new_y = np.zeros(new_y_shape, dtype='int32')

    counter = 0
    row_axis = 3 if is_channels_first else 2
    col_axis = 4 if is_channels_first else 3
    for b in range(X.shape[0]):
        for i in range(rep_number):
            for j in range(rep_number):
                if i != rep_number - 1:
                    x_start, x_end = i * reshape_size, (i + 1) * reshape_size
                else:
                    # final tile is anchored to the image edge so it may
                    # overlap the previous tile instead of running off the end
                    x_start, x_end = -reshape_size, X.shape[row_axis]
                if j != rep_number - 1:
                    y_start, y_end = j * reshape_size, (j + 1) * reshape_size
                else:
                    y_start, y_end = -reshape_size, y.shape[col_axis]

                if is_channels_first:
                    new_X[counter] = X[b, :, :, x_start:x_end, y_start:y_end]
                    new_y[counter] = relabel_movie(y[b, :, :, x_start:x_end, y_start:y_end])
                else:
                    new_X[counter] = X[b, :, x_start:x_end, y_start:y_end, :]
                    new_y[counter] = relabel_movie(y[b, :, x_start:x_end, y_start:y_end, :])

                counter += 1

    print(f'Reshaped feature data from {y.shape} to {new_y.shape}')
    print(f'Reshaped training data from {X.shape} to {new_X.shape}')
    return new_X, new_y