# Source code for podium.datasets.hf

"""
Module contains the converter class for processing the HuggingFace Datasets.
"""
import textwrap
from typing import Dict, Iterator, Optional, Union

from podium.datasets import Dataset, DatasetBase
from podium.field import Field, LabelField
from podium.utils.general_utils import repr_type_and_attrs
from podium.vocab import Vocab

from .example_factory import Example, ExampleFactory


# The HuggingFace `datasets` package is used throughout this module
# (type annotations, isinstance checks). If the import fails, a short hint
# is printed and the original ImportError is re-raised, so importing this
# module still fails -- the print only adds context for the user.
try:
    import datasets
except ImportError:
    print(
        "Problem occured while trying to import datasets. "
        "If the library is not installed visit "
        "https://huggingface.co/docs/datasets/ for more details."
    )
    raise


def _identity(x):
    return x


def _convert_feature(name: str, feature: datasets.features.FeatureType) -> Field:
    """
    Function for converting features of the HuggingFace Dataset to the
    podium.Fields.

    Conversion rules:

    * ``datasets.ClassLabel`` -> ``LabelField`` with an identity
      numericalizer and ``allow_missing_data=False``.
    * ``datasets.Value`` with a boolean, binary, integer or float dtype ->
      untokenized ``Field`` with an identity numericalizer.
    * ``datasets.Value`` with a string dtype -> ``Field`` with a non-eager
      ``Vocab`` as the numericalizer.
    * ``datasets.Value`` with any other dtype, as well as ``dict``, ``list``,
      ``datasets.Sequence``, ``datasets.Translation`` and
      ``datasets.TranslationVariableLanguages`` -> untokenized ``Field``
      without a numericalizer (values are stored as-is).

    All fields except the ``LabelField`` are created with
    ``allow_missing_data=True``.

    Parameters
    ----------
    name : str
        Name of the column corresponding to the feature.
    feature : datasets.features.FeatureType
        Column feature.

    Returns
    -------
    podium.Field
        podium.Field after the conversion.

    Raises
    ------
    TypeError
        If conversion of the given feature type is not supported.
    """
    if isinstance(feature, datasets.ClassLabel):
        # Label values are passed through unchanged by the identity
        # numericalizer; missing labels are not allowed.
        return LabelField(name=name, numericalizer=_identity, allow_missing_data=False)

    if isinstance(feature, datasets.Value):
        dtype = feature.dtype

        if dtype in {
            "bool",
            "binary",
            "large_binary",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
            "float",  # alias for float32
            "double",  # alias for float64
        }:
            # Scalar values: no tokenization, value is used as-is.
            kwargs = {
                "tokenizer": None,
                "keep_raw": False,
                "numericalizer": _identity,
            }

        elif dtype in {"string", "large_string", "utf8"}:
            # Since the dataset won't actually be loaded, we have to
            # set eager to False here so .finalize_fields() triggers
            # the Vocab construction later.
            kwargs = {"numericalizer": Vocab(eager=False)}

        else:
            # the other dtypes are not processed and are stored as-is,
            # for the full list see: https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
            kwargs = {"tokenizer": None, "keep_raw": False, "numericalizer": None}

    elif isinstance(
        feature,
        (
            dict,
            list,
            datasets.Sequence,
            datasets.Translation,
            datasets.TranslationVariableLanguages,
        ),
    ):
        # Nested / composite features are not processed and are stored as-is.
        kwargs = {"tokenizer": None, "keep_raw": False, "numericalizer": None}

    else:
        # The exception must actually be raised: merely constructing it would
        # let execution fall through to the code below with `kwargs` unbound.
        raise TypeError(
            f"Conversion for feature type {type(feature).__name__} " "is not supported"
        )

    # allow missing data for all fields except
    # the ones corresponding to the datasets.ClassLabel
    kwargs["allow_missing_data"] = True
    return Field(name=name, **kwargs)


def convert_features_to_fields(
    features: Dict[str, datasets.features.FeatureType]
) -> Dict[str, Field]:
    """
    Build podium.Fields for every column of a HuggingFace Dataset.

    Each (column name, feature) pair of the input mapping is converted with
    ``_convert_feature`` and stored under the same column name.

    Parameters
    ----------
    features : dict(str, datasets.features.FeatureType)
        Dictionary that maps a column name to the feature.

    Returns
    -------
    dict(str, podium.Field)
        Dictionary that maps a column name to a podium.Field.
    """
    converted = {}
    for column_name, column_feature in features.items():
        converted[column_name] = _convert_feature(column_name, column_feature)
    return converted


class HFDatasetConverter(DatasetBase):
    """
    Class for converting rows from the HuggingFace Datasets to podium.Examples.

    The converter wraps a ``datasets.Dataset`` and converts its rows to
    Examples on access (indexing or iteration); use ``as_dataset`` to
    materialize all rows into a podium.Dataset.
    """

    def __init__(
        self, hf_dataset: datasets.Dataset, fields: Optional[Dict[str, Field]] = None
    ) -> None:
        """
        HFDatasetConverter constructor.

        Parameters
        ----------
        hf_dataset : datasets.Dataset
            HuggingFace Dataset.

        fields : dict(str, podium.Field)
            Dictionary that maps a column name of the dataset to a podium.Field.
            If passed None the default feature conversion rules will be used
            to build a dictionary from the dataset features.

        Raises
        ------
        TypeError
            If dataset is not an instance of datasets.Dataset.
        """
        if not isinstance(hf_dataset, datasets.Dataset):
            raise TypeError(
                "Incorrect dataset type. Expected datasets.Dataset, "
                f"but got {type(hf_dataset).__name__}"
            )

        # A falsy `fields` (None or an empty dict) falls back to the fields
        # derived from the dataset features.
        super().__init__(fields or convert_features_to_fields(hf_dataset.features))
        self._dataset = hf_dataset
        self._example_factory = ExampleFactory(self.field_dict)

    @property
    def dataset(self):
        """
        The wrapped HuggingFace dataset.
        """
        return self._dataset

    def _get_examples(self):
        # Examples are produced by iterating over the converter itself.
        yield from self

    def __getitem__(self, i):
        """
        Index the wrapped dataset and convert the result.

        Parameters
        ----------
        i : int or any other index accepted by the wrapped dataset
            An int returns a single Example. Any other index is treated
            as selecting multiple rows.

        Returns
        -------
        Example or podium.Dataset
            A single Example for an int index, otherwise a podium.Dataset
            containing the selected Examples.
        """
        raw_examples = self.dataset[i]

        # Index or slice
        if isinstance(i, int):
            return self._example_factory.from_dict(raw_examples)

        # Slice of hf.datasets.Dataset is a dictionary that maps
        # to a list of values. To map this to a list of our examples,
        # we map the single dictionary to a list of dictionaries and
        # then convert this to a list of podium Examples

        # Unpack the dict, creating a dict for each value tuple
        raw_examples = [
            dict(zip(raw_examples, values)) for values in zip(*raw_examples.values())
        ]

        # Map each raw example to a Podium example
        examples = [
            self._example_factory.from_dict(raw_example)
            for raw_example in raw_examples
        ]

        # Cast to a dataset
        return Dataset(examples, self.fields, sort_key=None)

    def __iter__(self) -> Iterator[Example]:
        """
        Iterate through the dataset and convert the examples.
        """
        for raw_example in self._dataset:
            yield self._example_factory.from_dict(raw_example)

    def __len__(self) -> int:
        """
        Return the length of the wrapped dataset.
        """
        return len(self.dataset)

    def __repr__(self):
        # One field repr per line, each indented by 8 spaces and
        # separated by a comma, wrapped in square brackets.
        fields_str = ",\n".join(textwrap.indent(repr(f), " " * 8) for f in self.fields)
        fields_str = f"[\n{fields_str}\n    \n]"
        attrs = {
            "dataset_name": self._dataset.builder_name,
            "size": len(self),
            "fields": fields_str,
        }
        return repr_type_and_attrs(self, attrs, with_newlines=True, repr_values=False)

    def as_dataset(self) -> Dataset:
        """
        Convert the original HuggingFace dataset to a podium.Dataset.

        All rows are converted to Examples immediately.

        Returns
        -------
        podium.Dataset
            podium.Dataset instance.
        """
        return Dataset(list(self), self.fields)

    @staticmethod
    def from_dataset_dict(
        dataset_dict: Dict[str, datasets.Dataset],
        fields: Optional[Dict[str, Field]] = None,
        cast_to_podium: bool = False,
    ) -> Dict[str, Union["HFDatasetConverter", Dataset]]:
        """
        Copies the keys of given dictionary and converts the corresponding
        HuggingFace Datasets to the HFDatasetConverter instances.

        Parameters
        ----------
        dataset_dict: dict(str, datasets.Dataset)
            Dictionary that maps dataset names to HuggingFace Datasets.

        fields : dict(str, podium.Field)
            Dictionary that maps a column name of the dataset to a podium.Field.
            The same dictionary is passed to the converter of every dataset.
            If passed None the default feature conversion rules will be used
            for each dataset.

        cast_to_podium: bool
            Determines whether to immediately convert the HuggingFace dataset
            to Podium dataset (if True), or shallowly wrap the HuggingFace dataset
            in the HFDatasetConverter class.
            The HFDatasetConverter class currently doesn't support full Podium
            functionality and will not work with other components in the library.

        Returns
        -------
        dict(str, Union[HFDatasetConverter, podium.Dataset])
            Dictionary that maps dataset names to HFDatasetConverter instances,
            or to podium.Dataset instances if ``cast_to_podium`` is True.

        Raises
        ------
        TypeError
            If the given argument is not a dictionary.
        """
        if not isinstance(dataset_dict, dict):
            raise TypeError(
                "Incorrect argument type. Expected dict, "
                f"but got {type(dataset_dict).__name__}"
            )

        def cast(dataset):
            # Wrap first; materialize into a podium.Dataset only on request.
            dataset = HFDatasetConverter(dataset, fields)
            return dataset.as_dataset() if cast_to_podium else dataset

        return {name: cast(dataset) for name, dataset in dataset_dict.items()}