# Source code for mlscorecheck.aggregated._dataset

"""
This module implements an abstraction for a dataset
"""
# disabling pylint false positives
# pylint: disable=no-member

from typing import Optional

from ._utils import random_identifier

from ..experiments import dataset_statistics

__all__ = ["Dataset"]


class Dataset:
    """
    The abstract representation of a dataset

    A dataset is specified either by its explicit counts (``p`` and ``n``)
    or by a name that is looked up in ``dataset_statistics`` - never both.
    """

    def __init__(
        self,
        p: Optional[int] = None,
        n: Optional[int] = None,
        dataset_name: Optional[str] = None,
        identifier: Optional[str] = None,
    ):
        """
        Constructor of a dataset

        Args:
            p (None|int): the number of positives
            n (None|int): the number of negatives
            dataset_name (None|str): the name of the dataset in the
                                    mlscorecheck specification, for example,
                                    'common_datasets.ADA'
            identifier (None|str): the identifier of the dataset (randomly
                                    generated if None)

        Raises:
            ValueError: if only one of ``p`` and ``n`` is specified, if
                        neither the counts nor the name is specified, or if
                        both the counts and the name are specified
        """
        # p and n must be given together or not at all
        if (p is None and n is not None) or (p is not None and n is None):
            raise ValueError("specify either p and n or neither of them")
        # exactly one of the two specifications (counts / name) is required
        if p is None and dataset_name is None:
            raise ValueError("specify either p and n or the name")
        if p is not None and dataset_name is not None:
            raise ValueError("specify either p and n or the name")

        self.p = p
        self.n = n
        self.dataset_name = dataset_name

        # fills in p and n from the dataset name when they were not given
        self.resolve_pn()

        if identifier is None:
            # name-based datasets get a readable prefix, the others a purely
            # random identifier
            self.identifier = (
                f"{dataset_name}_{random_identifier(3)}"
                if dataset_name is not None
                else random_identifier(5)
            )
        else:
            self.identifier = identifier

    def resolve_pn(self):
        """
        Resolves the ``p`` and ``n`` values from the name of the dataset

        Does nothing when ``p`` is already set; otherwise the entry belonging
        to ``dataset_name`` in ``dataset_statistics`` supplies both counts.
        """
        if self.p is None:
            dataset = dataset_statistics[self.dataset_name]
            self.p = dataset["p"]
            self.n = dataset["n"]

    def to_dict(self) -> dict:
        """
        Dictionary representation of the dataset

        The counts are reported as None when the dataset is specified by its
        name, since the constructor rejects counts and a name given together.

        Returns:
            dict: the dictionary with the keys ``p``, ``n``, ``dataset_name``
            and ``identifier``
        """
        return {
            "p": self.p if self.dataset_name is None else None,
            "n": self.n if self.dataset_name is None else None,
            "dataset_name": self.dataset_name,
            "identifier": self.identifier,
        }