import numpy as np
from pathlib import Path
from logging import getLogger, Logger
from dataclasses import dataclass
from denspp.offline import get_path_to_project, check_elem_unique
# NOTE: the import location of TransformLabels is an assumption; the dataclass is used
# by logic_combination below but was not imported in the original source
from denspp.offline import TransformLabels
from denspp.offline.data_call.owncloud_handler import OwnCloudDownloader
@dataclass(frozen=True)
class DatasetFromFile:
"""Dataclass with data, labels and dict loaded externally
Attributes:
data: Numpy array with dataset content, shape = [num. samples, dimension]
label: Numpy array with labels, shape = [num. samples]
dict: List with names for each class/label
mean: Numpy array with mean values, shape = [num. samples, dimension]
"""
data: np.ndarray
label: np.ndarray
dict: list
mean: np.ndarray
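
# Illustrative sketch (not part of the module API): constructing a DatasetFromFile by hand
# from synthetic 1D waveforms. All shapes and names below are assumptions for demonstration.
def _example_build_dataset() -> DatasetFromFile:
    rng = np.random.default_rng(seed=0)
    return DatasetFromFile(
        data=rng.standard_normal((100, 32)),                # 100 samples with 32 points each
        label=rng.integers(0, 4, size=100),                 # one class ID per sample
        dict=['class_a', 'class_b', 'class_c', 'class_d'],  # one name per class
        mean=np.zeros((100, 32))                            # placeholder for the mean values
    )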
@dataclass
class SettingsDataset:
"""Class for handling preparation of dataset
Attributes:
data_path: String with path to dataset
data_type: String with name of unique key to identify dataset to load [e.g. Waveform, MNIST, ...]
use_cell_sort_mode: Number for building a sub-dataset from original dataset [0: None, 1: Reduced, 2: Type, 3: Group]
augmentation_do: Boolean for applying data augmentation (only 1D data)
augmentation_num: Number of the samples of each class
normalization_do: Boolean for applying data normalization
normalization_method: String with applied normalization method ['zeroone', 'minmax', 'norm', 'zscore', 'medianmad', 'meanmad']
reduce_samples_per_cluster_do: Boolean for reducing number of samples per class
reduce_samples_per_cluster_num: Number of reduced samples per class
exclude_cluster: List with IDs for excluding cluster/label IDs
"""
# --- Settings of Datasets
data_path: str
data_type: str
use_cell_sort_mode: int
    # --- Data Augmentation
    augmentation_do: bool
    augmentation_num: int
    # --- Data Normalization
    normalization_do: bool
    normalization_method: str
    # --- Dataset Reduction
    reduce_samples_per_cluster_do: bool
    reduce_samples_per_cluster_num: int
    # --- Dataset Preparation
    exclude_cluster: list
@property
def get_path2folder(self) -> Path:
"""Getting the path name to the file"""
if not Path(self.data_path).is_absolute():
path = self.get_path2folder_project / self.data_path
else:
path = Path(self.data_path)
return path.absolute()
@property
def get_path2folder_project(self) -> Path:
"""Getting the default path of the Python Project"""
return Path(get_path_to_project())
DefaultSettingsDataset = SettingsDataset(
data_path='dataset',
data_type='',
use_cell_sort_mode=0,
augmentation_do=False,
augmentation_num=0,
normalization_do=False,
normalization_method='minmax',
reduce_samples_per_cluster_do=False,
reduce_samples_per_cluster_num=0,
exclude_cluster=[]
)
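
# Illustrative sketch: deriving custom settings from the defaults with dataclasses.replace.
# The dataset key 'waveform' and the chosen options are assumptions for demonstration.
def _example_custom_settings() -> SettingsDataset:
    from dataclasses import replace
    return replace(
        DefaultSettingsDataset,
        data_type='waveform',           # must match a registered '_get_'/'_prepare_' method suffix
        normalization_do=True,
        normalization_method='zscore'   # one of the documented normalization methods
    )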
class ControllerDataset:
    """Base class for dataset handling. Subclasses register a dataset '<name>' by
    implementing the methods '_get_<name>' (download) and '_prepare_<name>' (processing)."""
    _logger: Logger
    _settings: SettingsDataset
    _methods: list
    _index_search: list = ['_get_', '_prepare_']
    _path: Path
def __init__(self, settings: SettingsDataset, temp_folder: str='') -> None:
self._settings = settings
self._logger = getLogger(__name__)
self._methods = self._extract_func(self.__class__)
self._path = self._settings.get_path2folder_project / temp_folder
@property
def get_overview_methods(self) -> list:
"""Returning a list with string of all available dataset methods"""
return self._methods
@property
def get_path2folder(self) -> Path:
"""Returning the absolute path to the folder"""
return self._path.absolute()
    def _extract_func(self, class_obj: object) -> list:
        # Collect all method names containing '_get_' or '_prepare_' (dataset registration convention)
        return [method for method in dir(class_obj) if self._index_search[0] in method or self._index_search[1] in method]
    def _extract_methods(self, search_index: str) -> list:
        # Extract the dataset name after the search index (splitting on the full prefix,
        # not on '_', so dataset names containing underscores stay intact)
        return [method.split(search_index)[-1].lower() for method in self._methods if search_index in method]
def _extract_executive_method(self, search_index: str) -> int:
used_data_source_idx = -1
for idx, method in enumerate(self._methods):
check = method.split(search_index)[-1].lower()
if self._settings.data_type.lower() == check:
used_data_source_idx = idx
break
return used_data_source_idx
    def __download_if_missing(self) -> None:
        idx = self._extract_executive_method(self._index_search[0])
        if idx == -1:
            raise NotImplementedError(f"No '_get_' method is registered for data_type '{self._settings.data_type}'")
        else:
            getattr(self, self._methods[idx])()
    def __process_data(self) -> DatasetFromFile:
        idx = self._extract_executive_method(self._index_search[1])
        if idx == -1:
            raise NotImplementedError(f"No '_prepare_' method is registered for data_type '{self._settings.data_type}'")
        else:
            return getattr(self, self._methods[idx])()
def print_overview_datasets(self, do_print: bool=True) -> list:
"""Giving an overview of available datasets on the cloud storage
:return: Return a list with dataset names
"""
oc_handler = OwnCloudDownloader(path2config=str(self._path))
list_datasets = self._extract_methods(self._index_search[1])
list_datasets.extend(oc_handler.get_overview_data(use_dataset=True))
if do_print:
self._logger.info("\nAvailable datasets in repository and from remote:")
self._logger.info("==================================================")
for idx, file in enumerate(list_datasets):
self._logger.info(f"\t{idx}: \t{file}")
oc_handler.close()
return list_datasets
    def print_dataset_properties(self, data: DatasetFromFile) -> None:
        """Printing the properties of the loaded dataset
        :param data: Dataclass DatasetFromFile loaded externally
        :return: None
        """
        check = np.unique(data.label, return_counts=True)
        # shape-agnostic report: works for 2D [samples, dim] and 3D [samples, h, w] data
        self._logger.info(f"... for training are {data.data.shape[0]} frames with each "
                          f"{tuple(data.data.shape[1:])} points available")
self._logger.info(f"... used data points for training: "
f"in total {check[0].size} classes with {np.sum(check[1])} samples")
for idx, id0 in enumerate(check[0]):
            addon = '' if len(data.dict) == 0 else f' ({data.dict[idx]})'
self._logger.info(f"\tclass {id0}{addon} --> {check[1][idx]} samples")
def load_dataset(self, do_print: bool=True) -> DatasetFromFile:
"""Loading the dataset from defined data file
:return: Dataclass DatasetFromFile with attributes ['data', 'label', 'dict', 'mean']
"""
if self._settings.data_type.lower() == '':
self.print_overview_datasets(do_print=do_print)
            raise AttributeError("--- Dataset is not available. Please enter the dataset name in the yaml file ---")
else:
self._settings.get_path2folder.mkdir(parents=True, exist_ok=True)
self.__download_if_missing()
return self.__process_data()
    def _download_file(self, dataset_name: str) -> None:
        # Check for the target file (not just the folder, which load_dataset() already
        # creates) so the download runs only when the file is actually missing
        target = self._settings.get_path2folder / dataset_name
        if not target.exists():
            oc_handler = OwnCloudDownloader(str(self._path))
            oc_handler.download_file(
                use_dataset=True,
                file_name=dataset_name,
                destination_download=str(target)
            )
            oc_handler.close()
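
# Minimal sketch of the registration convention (class, method and file names are assumptions):
# a subclass implements '_get_<name>' to fetch the raw file and '_prepare_<name>' to parse it
# into a DatasetFromFile; ControllerDataset dispatches on settings.data_type == '<name>'.
class _ExampleWaveformDataset(ControllerDataset):
    def _get_waveform(self) -> None:
        # A real handler would call self._download_file('waveform.npy') here;
        # this sketch skips the download and builds synthetic data instead
        pass

    def _prepare_waveform(self) -> DatasetFromFile:
        # Synthetic stand-in for parsing the downloaded file into the common dataclass
        rng = np.random.default_rng(seed=42)
        return DatasetFromFile(
            data=rng.standard_normal((100, 32)),
            label=rng.integers(0, 4, size=100),
            dict=[],
            mean=np.zeros((4, 32))
        )

# Usage sketch:
#   settings = dataclasses.replace(DefaultSettingsDataset, data_type='waveform')
#   dataset = _ExampleWaveformDataset(settings).load_dataset()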
def logic_combination(labels_in: TransformLabels, translate_list: list) -> TransformLabels:
    """Combining label classes to reduce the number of label classes
    :param labels_in: Dataclass with labels for the true and predicted case
    :param translate_list: List with label IDs to combine (e.g. [[1, 2], [0, 3]] -> [0, 1])
    :returns: Transformed new dataclass
    """
    assert labels_in.true.shape == labels_in.pred.shape, "Shapes of labels are not equal"
    assert len(translate_list), "List with new translation is empty"
    assert check_elem_unique(translate_list), "Not all key elements in sublists are unique"
    # Note: label IDs not listed in translate_list fall back to class 0 (zero initialisation)
    true_labels_new = np.zeros_like(labels_in.true, dtype=np.uint8)
    pred_labels_new = np.zeros_like(labels_in.pred, dtype=np.uint8)
for idx, cluster in enumerate(translate_list):
for id0 in cluster:
pos = np.argwhere(labels_in.true == id0).flatten()
true_labels_new[pos] = idx
pos = np.argwhere(labels_in.pred == id0).flatten()
pred_labels_new[pos] = idx
return TransformLabels(
true=true_labels_new,
pred=pred_labels_new
)
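
# Usage sketch for logic_combination (values are illustrative): classes 1 and 2 are merged
# into new class 0, and classes 0 and 3 into new class 1.
def _example_logic_combination() -> TransformLabels:
    labels = TransformLabels(
        true=np.array([0, 1, 2, 3, 1], dtype=np.uint8),
        pred=np.array([0, 1, 1, 3, 2], dtype=np.uint8)
    )
    return logic_combination(labels, translate_list=[[1, 2], [0, 3]])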