Source code for deepcell.utils.data_utils

# Copyright 2016-2023 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.github.com/vanvalenlab/deepcell-tf/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions for making training data"""


import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from keras.utils import conv_utils

from deepcell.utils.tracking_utils import load_trks



[docs]
def get_data(file_name, mode='sample', test_size=.2, seed=0):
    """Load data from NPZ file and split into train and test sets

    Args:
        file_name (str): path to the file to load. It is read with
            ``load_trks`` when ``mode`` is ``'siamese_daughters'`` and
            with ``numpy.load`` otherwise.
        mode (str): if 'siamese_daughters', returns lineage information from
            .trk file otherwise, returns the same data that was loaded.
        test_size (float): fraction of data to leave as testing holdout
        seed (int): seed number for random train/test split repeatability

    Returns:
        (dict, dict): dict of training data, and a dict of testing data.
        Both dicts hold ``'X'`` and ``'y'``; in ``'siamese_daughters'``
        mode they additionally hold ``'daughters'``.
    """
    # siamese_daughters mode is used to import lineage data
    # and associate it with the appropriate batch
    if mode == 'siamese_daughters':
        training_data = load_trks(file_name)
        X = training_data['X']
        y = training_data['y']
        # `daughters` is of the form:
        #
        #                   2 children / cell (potentially empty)
        #                          ___________|__________
        #                         /                      \
        #      daughters = [{id_1: [daughter_1, daughter_2], ...}, ]
        #                  \___________________________________/
        #                                    |
        #                       dict of (cell_id -> children)
        #
        # each batch has a separate (cell_id -> children) dict
        daughters = [{cell: fields['daughters']
                      for cell, fields in tracks.items()}
                     for tracks in training_data['lineages']]

        X_train, X_test, y_train, y_test, ln_train, ln_test = train_test_split(
            X, y, daughters, test_size=test_size, random_state=seed)

        train_dict = {
            'X': X_train,
            'y': y_train,
            'daughters': ln_train
        }

        test_dict = {
            'X': X_test,
            'y': y_test,
            'daughters': ln_test
        }
        return train_dict, test_dict

    # The archive object returned by np.load keeps the file open; read the
    # two arrays inside a context manager so the handle is always released.
    with np.load(file_name) as training_data:
        X = training_data['X']
        y = training_data['y']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)

    train_dict = {
        'X': X_train,
        'y': y_train
    }

    test_dict = {
        'X': X_test,
        'y': y_test
    }

    return train_dict, test_dict




[docs]
def get_max_sample_num_list(y, edge_feature, output_mode='sample', padding='valid',
                            window_size_x=30, window_size_y=30):
    """For each set of images and each feature, find the maximum number
    of samples to be used. This will be used to balance class sampling.

    Args:
        y (numpy.array): mask to indicate which pixels belong to which class
        edge_feature (list): [1, 0, 0], the 1 indicates the feature
            is the cell edge
        output_mode (str):  'sample' or 'conv'
        padding (str): 'valid' or 'same'. With 'valid', ``y`` is first
            passed through ``trim_padding``.
        window_size_x (int): row window forwarded to ``trim_padding``
        window_size_y (int): column window forwarded to ``trim_padding``

    Returns:
        list: list of maximum sample size for all classes. In 'sample' mode
        there is one entry per image and per feature flagged with 1 in
        ``edge_feature`` (the sum of that channel); in any other mode there
        is one ``inf`` entry per image.
    """
    if padding == 'valid':
        y = trim_padding(y, window_size_x, window_size_y)

    # Any mode other than 'sample' places no cap on the number of samples.
    # np.inf is used because the np.Inf alias no longer exists in NumPy 2.
    if output_mode != 'sample':
        return [np.inf for _ in range(y.shape[0])]

    # Both of these are constant across images, so compute them once.
    is_channels_first = K.image_data_format() == 'channels_first'
    edge_channels = [k for k, edge_feat in enumerate(edge_feature)
                     if edge_feat == 1]

    list_of_max_sample_numbers = []
    # for each set of images
    for j in range(y.shape[0]):
        for k in edge_channels:
            if is_channels_first:
                y_sum = np.sum(y[j, k, :, :])
            else:
                y_sum = np.sum(y[j, :, :, k])
            list_of_max_sample_numbers.append(y_sum)

    return list_of_max_sample_numbers




[docs]
def sample_label_matrix(y, window_size=(30, 30), padding='valid',
                        max_training_examples=1e7, data_format=None):
    """Pick pixel coordinates from a 4D label tensor for window sampling.

    Every pixel equal to 1 in any feature channel is a candidate. With
    'valid' padding, candidates whose window would reach the image border
    are dropped; with 'same' padding, all candidates are kept. The kept
    candidates are then shuffled and truncated.

    Args:
        y (numpy.array): label masks with the same shape as X data
        window_size (tuple): size of window around each pixel to sample
        padding (str): padding type 'valid' or 'same'
        max_training_examples (int): upper bound on the total number of
            coordinates returned; a falsy value disables the bound
        data_format (str): 'channels_first' or 'channels_last'

    Returns:
        tuple: 4 int32 arrays (rows, cols, batch index, feature index),
        one entry per sampled pixel
    """
    data_format = conv_utils.normalize_data_format(data_format)
    channels_first = data_format == 'channels_first'
    if channels_first:
        n_batches, n_features, size_x, size_y = y.shape
    else:
        n_batches, size_x, size_y, n_features = y.shape

    win_x, win_y = conv_utils.normalize_tuple(window_size, 2, 'window_size')

    rows_out, cols_out, batch_out, label_out = [], [], [], []

    for batch in range(n_batches):
        for label in range(n_features):
            plane = y[batch, label, :, :] if channels_first else y[batch, :, :, label]
            hit_rows, hit_cols = np.where(plane == 1)

            # Nothing to sample when the feature is absent from this image
            if hit_rows.size == 0:
                continue

            # Visit the candidate pixels in a random order
            order = np.random.choice(np.arange(len(hit_rows)),
                                     size=len(hit_rows), replace=False)
            rows_perm = hit_rows[order]
            cols_perm = hit_cols[order]

            if padding == 'same':
                keep = np.ones(rows_perm.shape, dtype=bool)
            elif padding == 'valid':
                # The whole window must sit strictly inside the image
                keep = ((rows_perm - win_x > 0) &
                        (rows_perm + win_x < size_x) &
                        (cols_perm - win_y > 0) &
                        (cols_perm + win_y < size_y))
            else:
                # Unrecognised padding keeps no candidates
                keep = np.zeros(rows_perm.shape, dtype=bool)

            n_kept = int(np.count_nonzero(keep))
            rows_out.extend(rows_perm[keep])
            cols_out.extend(cols_perm[keep])
            batch_out.extend([batch] * n_kept)
            label_out.extend([label] * n_kept)

    # Shuffle across all images/features and apply the overall cap
    all_indices = np.arange(len(rows_out), dtype='int32')
    cap = int(max_training_examples) if max_training_examples else all_indices.size
    n_samples = min(all_indices.size, cap)
    chosen = np.random.choice(all_indices, size=n_samples, replace=False)

    return (np.array(rows_out, dtype='int32')[chosen],
            np.array(cols_out, dtype='int32')[chosen],
            np.array(batch_out, dtype='int32')[chosen],
            np.array(label_out, dtype='int32')[chosen])




[docs]
def sample_label_movie(y, window_size=(30, 30, 5), padding='valid',
                       max_training_examples=1e7, data_format=None):
    """Pick voxel coordinates from a 5D label tensor for window sampling.

    Every voxel equal to 1 in any feature channel is a candidate. With
    'valid' padding, candidates whose window would reach the border along
    frames, rows or columns are dropped; with 'same' padding, all
    candidates are kept. The kept candidates are then shuffled and
    truncated.

    Args:
        y (numpy.array): label masks with the same shape as X data
        window_size (tuple): size of window around each pixel to sample,
            unpacked as (x, y, z)
        padding (str): padding type 'valid' or 'same'
        max_training_examples (int): upper bound on the total number of
            coordinates returned; a falsy value disables the bound
        data_format (str): 'channels_first' or 'channels_last'

    Returns:
        tuple: 5 int32 arrays (frames, rows, cols, batch index,
        feature index), one entry per sampled voxel
    """
    data_format = conv_utils.normalize_data_format(data_format)
    channels_first = data_format == 'channels_first'
    if channels_first:
        n_batches, n_features, size_z, size_x, size_y = y.shape
    else:
        n_batches, size_z, size_x, size_y, n_features = y.shape

    win_x, win_y, win_z = conv_utils.normalize_tuple(window_size, 3, 'window_size')

    rows_out, cols_out, frames_out, batch_out, label_out = [], [], [], [], []

    for batch in range(n_batches):
        for label in range(n_features):
            if channels_first:
                volume = y[batch, label, :, :, :]
            else:
                volume = y[batch, :, :, :, label]
            hit_frames, hit_rows, hit_cols = np.where(volume == 1)

            # Nothing to sample when the feature is absent from this movie
            if hit_rows.size == 0:
                continue

            # Visit the candidate voxels in a random order
            order = np.random.choice(np.arange(len(hit_rows)),
                                     size=len(hit_rows), replace=False)
            frames_perm = hit_frames[order]
            rows_perm = hit_rows[order]
            cols_perm = hit_cols[order]

            if padding == 'same':
                keep = np.ones(rows_perm.shape, dtype=bool)
            elif padding == 'valid':
                # The whole window must sit strictly inside the volume
                keep = ((frames_perm - win_z > 0) &
                        (frames_perm + win_z < size_z) &
                        (rows_perm - win_x > 0) &
                        (rows_perm + win_x < size_x) &
                        (cols_perm - win_y > 0) &
                        (cols_perm + win_y < size_y))
            else:
                # Unrecognised padding keeps no candidates
                keep = np.zeros(rows_perm.shape, dtype=bool)

            n_kept = int(np.count_nonzero(keep))
            rows_out.extend(rows_perm[keep])
            cols_out.extend(cols_perm[keep])
            frames_out.extend(frames_perm[keep])
            batch_out.extend([batch] * n_kept)
            label_out.extend([label] * n_kept)

    # Shuffle across all movies/features and apply the overall cap
    all_indices = np.arange(len(rows_out), dtype='int32')
    cap = int(max_training_examples) if max_training_examples else all_indices.size
    n_samples = min(all_indices.size, cap)
    chosen = np.random.choice(all_indices, size=n_samples, replace=False)

    return (np.array(frames_out, dtype='int32')[chosen],
            np.array(rows_out, dtype='int32')[chosen],
            np.array(cols_out, dtype='int32')[chosen],
            np.array(batch_out, dtype='int32')[chosen],
            np.array(label_out, dtype='int32')[chosen])




[docs]
def trim_padding(nparr, win_x, win_y, win_z=None):
    """Trim the boundaries of the numpy array to allow for a sliding
    window of size (win_x, win_y) to not slide over regions without pixel data

    The row/column (and, for 5D input, frame) axes are located according to
    ``K.image_data_format()``. The batch and channel axes are never trimmed.

    Args:
        nparr (numpy.array): numpy array to trim
        win_x (int): number of row pixels to ignore on either side
        win_y (int): number of column pixels to ignore on either side
        win_z (int): number of frames to ignore on either side. Only used
            for 5D input; a falsy value leaves the frame axis untouched.

    Returns:
        numpy.array: trimmed numpy array of size
        ``x - 2 * win_x, y - 2 * win_y``

    Raises:
        ValueError: nparr.ndim is not 4 or 5
    """
    if nparr.ndim not in (4, 5):
        raise ValueError('Expected to trim numpy array of ndim 4 or 5, '
                         f'got "{nparr.ndim}"')

    def _crop(win):
        # ``0:-0`` is an empty slice, so a zero-width window must map to
        # "keep the whole axis" instead of ``slice(0, -0)``.
        return slice(win, -win) if win else slice(None)

    is_channels_first = K.image_data_format() == 'channels_first'

    # Slices for the non-batch, non-channel axes, in axis order
    inner = [_crop(win_x), _crop(win_y)]
    if nparr.ndim == 5:
        inner.insert(0, _crop(int(win_z)) if win_z else slice(None))

    keep_all = slice(None)
    if is_channels_first:
        index = (keep_all, keep_all, *inner)
    else:
        index = (keep_all, *inner, keep_all)
    return nparr[index]




[docs]
def reshape_matrix(X, y, reshape_size=256):
    """
    Reshape matrix of dimension 4 to have x and y of size reshape_size.
    Adds overlapping slices to batches.
    E.g. ``reshape_size`` of 256 yields
    (1, 1024, 1024, 1) -> (16, 256, 256, 1)
    The input image is divided into subimages of side length reshape_size,
    with the last row and column of subimages overlapping the one before the
    last if the original image side lengths are not divisible by
    ``reshape_size``. Each label tile is passed through ``relabel_movie``.

    Args:
        X (numpy.array): raw 4D image tensor
        y (numpy.array): label mask of 4D image data
        reshape_size (int, list): size of the output tensor
            If input is int, output images are square with side length equal
            reshape_size. If it is a list of 2 ints, then the output images
            size is reshape_size[0] x reshape_size[1]

    Returns:
        numpy.array: reshaped ``X`` and ``y`` 4D tensors
        in ``shape[1:3] = (reshape_size, reshape_size)``,
        if ``reshape_size`` is an ``int``,
        and ``shape[1:3] = reshape_size``,
        if ``reshape_size`` is a list of length 2.
        ``X`` is returned with dtype ``K.floatx()`` and ``y`` as ``int32``.

    Raises:
        ValueError: ``X.ndim`` is not 4
        ValueError: ``y.ndim`` is not 4
        ValueError: ``reshape_size`` is neither an int nor 2 ints
    """
    # Validate arguments first; messages are formatted strings, matching
    # the style used by reshape_movie.
    if X.ndim != 4:
        raise ValueError(f'reshape_matrix expects X dim to be 4, got {X.ndim}')
    if y.ndim != 4:
        raise ValueError(f'reshape_matrix expects y dim to be 4, got {y.ndim}')

    if isinstance(reshape_size, int):
        reshape_size_x = reshape_size_y = reshape_size
    elif len(reshape_size) == 2 and all(isinstance(x, int) for x in reshape_size):
        reshape_size_x, reshape_size_y = reshape_size
    else:
        raise ValueError('reshape_size must be an integer or an iterable containing 2 integers.')

    is_channels_first = K.image_data_format() == 'channels_first'
    row_axis = 2 if is_channels_first else 1
    col_axis = row_axis + 1

    image_size_x, image_size_y = X.shape[row_axis], X.shape[col_axis]
    rep_number_x = int(np.ceil(image_size_x / reshape_size_x))
    rep_number_y = int(np.ceil(image_size_y / reshape_size_y))
    new_batch_size = X.shape[0] * rep_number_x * rep_number_y

    if is_channels_first:
        new_X_shape = (new_batch_size, X.shape[1], reshape_size_x, reshape_size_y)
        new_y_shape = (new_batch_size, y.shape[1], reshape_size_x, reshape_size_y)
    else:
        new_X_shape = (new_batch_size, reshape_size_x, reshape_size_y, X.shape[3])
        new_y_shape = (new_batch_size, reshape_size_x, reshape_size_y, y.shape[3])

    new_X = np.zeros(new_X_shape, dtype=K.floatx())
    new_y = np.zeros(new_y_shape, dtype='int32')

    counter = 0
    for b in range(X.shape[0]):
        for i in range(rep_number_x):
            # The last tile along an axis is anchored to the far edge, so it
            # overlaps its neighbour when the size is not an exact multiple.
            if i != rep_number_x - 1:
                x_start, x_end = i * reshape_size_x, (i + 1) * reshape_size_x
            else:
                x_start, x_end = -reshape_size_x, X.shape[row_axis]

            for j in range(rep_number_y):
                if j != rep_number_y - 1:
                    y_start, y_end = j * reshape_size_y, (j + 1) * reshape_size_y
                else:
                    y_start, y_end = -reshape_size_y, y.shape[col_axis]

                if is_channels_first:
                    new_X[counter] = X[b, :, x_start:x_end, y_start:y_end]
                    new_y[counter] = y[b, :, x_start:x_end, y_start:y_end]
                else:
                    new_X[counter] = X[b, x_start:x_end, y_start:y_end, :]
                    new_y[counter] = y[b, x_start:x_end, y_start:y_end, :]

                new_y[counter] = relabel_movie(new_y[counter])
                counter += 1

    print(f'Reshaped feature data from {y.shape} to {new_y.shape}')
    print(f'Reshaped training data from {X.shape} to {new_X.shape}')
    return new_X, new_y




[docs]
def relabel_movie(y):
    """Relabels unique instance IDs to be from 1 to N

    Background (value 0) stays 0. Every other distinct value is mapped, in
    ascending order, to 1..N. Input without any background pixels and empty
    input are both supported.

    Args:
        y (numpy.array): tensor of integer labels

    Returns:
        numpy.array: relabeled tensor with sequential labels, with the same
        shape as ``y`` and NumPy's default float dtype
    """
    y = np.asarray(y)
    new_y = np.zeros(y.shape)

    # Drop the background by value rather than by position: the smallest
    # unique value is only the background when a 0 is actually present.
    unique_cells = np.unique(y)
    unique_cells = unique_cells[unique_cells != 0]
    if unique_cells.size == 0:
        return new_y

    # unique_cells is sorted, so the position of each foreground value in it
    # (plus one) is exactly its new sequential label.
    foreground = y != 0
    new_y[foreground] = np.searchsorted(unique_cells, y[foreground]) + 1
    return new_y




[docs]
def reshape_movie(X, y, reshape_size=256):
    """
    Reshape tensor of dimension 5 to have x and y of size ``reshape_size``.
    Adds overlapping slices to batches.
    E.g. ``reshape_size`` of 256 yields
    ``(1, 5, 1024, 1024, 1) -> (16, 5, 256, 256, 1)``

    The number of tiles is computed separately for rows and columns, so
    non-square frames are covered completely. The last tile along each axis
    is anchored to the far edge and overlaps its neighbour when the side
    length is not divisible by ``reshape_size``. Each label tile is passed
    through ``relabel_movie``.

    Args:
        X (numpy.array): raw 5D image tensor
        y (numpy.array): label mask of 5D image tensor
        reshape_size (int): size of the square output tensor

    Returns:
        numpy.array: reshaped ``X`` and ``y`` tensors in shape
        ``(reshape_size, reshape_size)``.
        ``X`` is returned with dtype ``K.floatx()`` and ``y`` as ``int32``.

    Raises:
        ValueError: ``X.ndim`` is not 5
        ValueError: ``y.ndim`` is not 5
    """
    if X.ndim != 5:
        raise ValueError(f'reshape_movie expects X dim to be 5, got {X.ndim}')
    if y.ndim != 5:
        raise ValueError(f'reshape_movie expects y dim to be 5, got {y.ndim}')

    is_channels_first = K.image_data_format() == 'channels_first'
    row_axis = 3 if is_channels_first else 2
    col_axis = 4 if is_channels_first else 3

    image_size_x, image_size_y = X.shape[row_axis], X.shape[col_axis]
    # Rows and columns get their own tile counts; deriving both from the
    # row size alone leaves gaps (or extra tiles) for non-square frames.
    rep_number_x = int(np.ceil(image_size_x / reshape_size))
    rep_number_y = int(np.ceil(image_size_y / reshape_size))
    new_batch_size = X.shape[0] * rep_number_x * rep_number_y

    if is_channels_first:
        new_X_shape = (new_batch_size, X.shape[1], X.shape[2], reshape_size, reshape_size)
        new_y_shape = (new_batch_size, y.shape[1], y.shape[2], reshape_size, reshape_size)
    else:
        new_X_shape = (new_batch_size, X.shape[1], reshape_size, reshape_size, X.shape[4])
        new_y_shape = (new_batch_size, y.shape[1], reshape_size, reshape_size, y.shape[4])

    new_X = np.zeros(new_X_shape, dtype=K.floatx())
    new_y = np.zeros(new_y_shape, dtype='int32')

    counter = 0
    for b in range(X.shape[0]):
        for i in range(rep_number_x):
            if i != rep_number_x - 1:
                x_start, x_end = i * reshape_size, (i + 1) * reshape_size
            else:
                x_start, x_end = -reshape_size, X.shape[row_axis]

            for j in range(rep_number_y):
                if j != rep_number_y - 1:
                    y_start, y_end = j * reshape_size, (j + 1) * reshape_size
                else:
                    y_start, y_end = -reshape_size, y.shape[col_axis]

                if is_channels_first:
                    new_X[counter] = X[b, :, :, x_start:x_end, y_start:y_end]
                    new_y[counter] = relabel_movie(y[b, :, :, x_start:x_end, y_start:y_end])
                else:
                    new_X[counter] = X[b, :, x_start:x_end, y_start:y_end, :]
                    new_y[counter] = relabel_movie(y[b, :, x_start:x_end, y_start:y_end, :])

                counter += 1

    print(f'Reshaped feature data from {y.shape} to {new_y.shape}')
    print(f'Reshaped training data from {X.shape} to {new_X.shape}')
    return new_X, new_y