# Source code for mlscorecheck.aggregated._dataset

"""
This module implements an abstraction for a dataset
"""
# disabling pylint false positives
# pylint: disable=no-member

from typing import Optional

from ._utils import random_identifier

from ..experiments import dataset_statistics

__all__ = ["Dataset"]



class Dataset:
    """
    The abstract representation of a dataset

    A dataset is specified either by explicit class counts (``p`` and ``n``)
    or by a ``dataset_name`` whose counts are looked up in
    ``dataset_statistics``. The two forms are mutually exclusive.
    """

    def __init__(
        self,
        p: Optional[int] = None,
        n: Optional[int] = None,
        dataset_name: Optional[str] = None,
        identifier: Optional[str] = None,
    ):
        """
        Constructor of a dataset

        Args:
            p (None|int): the number of positives
            n (None|int): the number of negatives
            dataset_name (None|str): the name of the dataset in the mlscorecheck specification
                            for example, 'common_datasets.ADA'
            identifier (None|str): the identifier of the dataset (generated with
                                    ``random_identifier`` if None)

        Raises:
            ValueError: if only one of ``p`` and ``n`` is specified, or if the
                        counts and ``dataset_name`` are both specified or both
                        missing
        """
        # p and n must be given together or not at all
        if (p is None) != (n is None):
            raise ValueError("specify either p and n or neither of them")
        # exactly one of the two specification forms (counts / name) is allowed
        if (p is None) == (dataset_name is None):
            raise ValueError("specify either p and n or the name")

        self.p = p
        self.n = n
        self.dataset_name = dataset_name

        # fills in p and n from the name when the counts were not given
        self.resolve_pn()

        if identifier is not None:
            self.identifier = identifier
        elif dataset_name is not None:
            # named datasets get the name as a readable prefix
            self.identifier = f"{dataset_name}_{random_identifier(3)}"
        else:
            self.identifier = random_identifier(5)

    def resolve_pn(self):
        """
        Resolves the ``p`` and ``n`` values from the name of the dataset

        Does nothing when ``p`` is already set. Otherwise the entry of
        ``dataset_statistics`` stored under ``dataset_name`` supplies both
        values.
        """
        if self.p is None:
            # NOTE(review): an unknown name fails in this lookup; presumably a
            # KeyError if dataset_statistics is a plain dict - confirm
            dataset = dataset_statistics[self.dataset_name]
            self.p = dataset["p"]
            self.n = dataset["n"]

    def to_dict(self) -> dict:
        """
        Dictionary representation of the dataset

        The keys match the constructor arguments. When the dataset was
        specified by name, ``p`` and ``n`` are reported as None, so the
        dictionary never carries both specification forms (which the
        constructor rejects).

        Returns:
            dict: the ``p``, ``n``, ``dataset_name`` and ``identifier`` entries
        """
        by_name = self.dataset_name is not None
        return {
            "p": None if by_name else self.p,
            "n": None if by_name else self.n,
            "dataset_name": self.dataset_name,
            "identifier": self.identifier,
        }