import numpy as np
from os.path import exists, join
from denspp.offline.dnn.pytorch_config_data import SettingsDataset, ControllerDataset
from denspp.offline.dnn.model_library import CellLibrary
from denspp.offline.data_call.owncloud_handler import OwnCloudDownloader
from denspp.offline.data_process.frame_preprocessing import calculate_frame_snr, calculate_frame_mean
from denspp.offline.data_process.frame_preprocessing import generate_zero_frames
from denspp.offline.data_process.frame_normalization import DataNormalization
from denspp.offline.data_process.frame_augmentation import augmentation_change_position, augmentation_reducing_samples


class DatasetLoader(ControllerDataset):
    _settings: SettingsDataset
    _path: str

    def __init__(self, settings: SettingsDataset, temp_folder: str='') -> None:
        """Class for downloading (methods named '__get_xyz') and preparing (methods named '__prepare_xyz')
        custom-defined datasets used for training deep learning models
        :param settings:    Object of class SettingsDataset for handling the dataset used in deep learning
        :param temp_folder: Optional path to a temporary working folder
        """
        super().__init__(settings, temp_folder)

    def __download_spike(self, dataset_name: str) -> None:
        """Downloading a spike dataset from the OwnCloud storage if it is not yet available locally
        :param dataset_name:    File name of the dataset to download
        """
        if not exists(self._settings.get_path2data):
            oc_handler = OwnCloudDownloader(self._path)
            oc_handler.download_file(
                use_dataset=True,
                file_name=dataset_name,
                destination_download=join(self._settings.get_path2folder, dataset_name)
            )
            oc_handler.close()

    def __preprocess_spike(self, add_noise_cluster: bool=False) -> dict:
        """Function for processing neural spike frame events from the dataset
        :param add_noise_cluster:  Adding a zero-noise cluster to the dataset
        :return:    Dict with keys 'data' (spike frames), 'label' (cluster ids), 'dict' (label names) and 'mean' (mean waveform per cluster)
        """
        # --- Loading rawdata ['data' = frames, 'label' = label id, 'peak' = amplitude values, 'dict' = label names]
        rawdata = np.load(self._settings.get_path2data, allow_pickle=True).flatten()[0]
        frames_dict = rawdata['dict']
        frames_in = rawdata['data']
        frames_cl = rawdata['label']
        # --- PART: Re-grouping clusters with a matching cell library
        cell_libs_handler = CellLibrary().get_registry()
        libs_class_overview = [lib.split("resort_")[-1] for lib in cell_libs_handler.get_library_overview(do_print=False)]
        libs_use = [f'resort_{lib}' for lib in libs_class_overview if lib in self._settings.get_path2data.lower()]
        if len(libs_use):
            new_data = self.reconfigure_cluster_with_cell_lib(
                fn=cell_libs_handler.build(libs_use[0]),
                sel_mode_classes=self._settings.use_cell_sort_mode,
                frames_in=frames_in,
                frames_cl=frames_cl
            )
            frames_in = new_data['frame']
            frames_cl = new_data['cl']
            frames_dict = new_data['dict']
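        # Illustrative example (assumed registry entry): if the registry exposes a library named
        # 'resort_quiroga' and the dataset path contains 'quiroga', libs_use above resolves to
        # ['resort_quiroga'] and the frames/labels are re-grouped by that library's mapping.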
        # --- PART: Reducing samples per cluster (if too large)
        if self._settings.reduce_samples_per_cluster_do:
            print("... do data augmentation with reducing the samples per cluster")
            frames_in, frames_cl = augmentation_reducing_samples(
                frames_in=frames_in,
                frames_cl=frames_cl,
                num_frames=self._settings.reduce_samples_per_cluster_num,
                do_shuffle=False
            )
        # --- PART: Exclusion of selected clusters
        if len(self._settings.exclude_cluster):
            for cluster_id in self._settings.exclude_cluster:
                selX = np.argwhere(frames_cl == cluster_id).flatten()
                frames_in = np.delete(frames_in, selX, 0)
                frames_cl = np.delete(frames_cl, selX, 0)
                if isinstance(frames_dict, list):
                    frames_dict.pop(cluster_id)
        # --- PART: Generate list with labeled cluster names
        if isinstance(frames_dict, dict):
            frames_dict = list()
            for cluster_id in np.unique(frames_cl):
                frames_dict.append(f"Neuron #{cluster_id}")
        # --- PART: Data Normalization
        if self._settings.normalization_do:
            print("... do data normalization")
            data_class_frames_in = DataNormalization(self._settings.normalization_method)
            frames_in = data_class_frames_in.normalize(frames_in)
# --- PART: Mean waveform calculation and data augmentation
frames_me = calculate_frame_mean(frames_in, frames_cl)
# --- PART: Calculate SNR if desired
if self._settings.augmentation_do or add_noise_cluster:
snr_mean = calculate_frame_snr(frames_in, frames_cl, frames_me)
else:
snr_mean = np.zeros(0, dtype=float)
# --- PART: Data Augmentation
if self._settings.augmentation_do and not self._settings.reduce_samples_per_cluster_do:
print("... do data augmentation")
new_frames, new_clusters = augmentation_change_position(
frames_in=frames_in,
frames_cl=frames_cl,
num_min_frames=self._settings.augmentation_num
)
frames_in = np.append(frames_in, new_frames, axis=0)
frames_cl = np.append(frames_cl, new_clusters, axis=0)
# --- PART: Generate and add noise cluster
if add_noise_cluster:
snr_range_zero = [np.median(snr_mean[:, 0]), np.median(snr_mean[:, 2])]
info = np.unique(frames_cl, return_counts=True)
num_cluster = np.max(info[0]) + 1
num_frames = np.max(info[1])
print(f"... adding a zero-noise cluster: cluster = {num_cluster} - number of frames = {num_frames}")
new_mean, new_clusters, new_frames = generate_zero_frames(frames_in.shape[1], num_frames, snr_range_zero)
frames_in = np.append(frames_in, new_frames, axis=0)
frames_cl = np.append(frames_cl, num_cluster + new_clusters, axis=0)
frames_me = np.vstack([frames_me, new_mean])
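            # Worked example (assumed values): with labels {0, 1, 2} occurring {120, 90, 150} times,
            # num_cluster = 3 and num_frames = 150, so 150 zero-noise frames are appended under the
            # new label 3 and their mean waveform is stacked onto frames_me.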
        return {'data': frames_in, 'label': frames_cl, 'dict': frames_dict, 'mean': frames_me}

    def __get_martinez(self) -> None:
        """Downloading the simulated Martinez (2009) spike dataset"""
        self.__download_spike('2023-05-15_Dataset_Sim_Martinez2009_Sorted.npy')

    def __prepare_martinez(self) -> dict:
        """Preparing the Martinez dataset for training"""
        return self.__preprocess_spike()

    def __get_quiroga(self) -> None:
        """Downloading the simulated Quiroga (2020) spike dataset"""
        self.__download_spike('2023-06-30_Dataset_Sim_Quiroga2020_Sorted.npy')

    def __prepare_quiroga(self) -> dict:
        """Preparing the Quiroga dataset for training"""
        return self.__preprocess_spike()
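

# --- Minimal usage sketch (illustrative only): how SettingsDataset is constructed and how
# --- ControllerDataset dispatches the private '__get_xyz'/'__prepare_xyz' pairs is project-specific
# --- and not shown in this module; the default construction and direct calls below are assumptions.
if __name__ == '__main__':
    settings = SettingsDataset()                                  # hypothetical default construction
    loader = DatasetLoader(settings=settings, temp_folder='temp')
    # Normally the base class resolves the matching get/prepare pair from the configured dataset;
    # the name-mangled calls below only illustrate the order in which the steps run.
    loader._DatasetLoader__get_martinez()
    dataset = loader._DatasetLoader__prepare_martinez()
    print(dataset['data'].shape, len(dataset['dict']))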