Source code for podium.vectorizers.impl.glove

import os

from podium.storage import LargeResource
from podium.vectorizers.vectorizer import WordVectors, random_normal_default_vector


class GloVe(WordVectors):
    """
    Class represents concrete vector storage for GloVe vectors described in
    https://nlp.stanford.edu/projects/glove/ . Class contains a Large resource
    so that vectors could be automatically downloaded on first use.

    Attributes
    ----------
    NAME_URL_MAPPING : dict(str, str)
        dictionary that maps glove instance name to download url
    NAME_DIM_MAPPING : dict(str, set(int))
        dictionary that maps glove instance name to available vector dimensions
        for given instance
    _NAME_FILE_MAPPING : dict(str, str)
        dictionary that maps glove instance name to the file name prefix used
        in the vectors folder; the full file name is built as
        ``<prefix>.<dim>d.txt``
    _ARCHIVE_TYPE : str
        type of archive in which the vectors are stored while downloading
    _BINARY : bool
        value forwarded to the ``WordVectors`` constructor as its ``binary``
        argument
    """

    NAME_URL_MAPPING = {
        "glove-common-42b": "http://nlp.stanford.edu/data/glove.42B.300d.zip",
        "glove-common-840b": "http://nlp.stanford.edu/data/glove.840B.300d.zip",
        "glove-twitter": "http://nlp.stanford.edu/data/glove.twitter.27B.zip",
        "glove-wikipedia": "http://nlp.stanford.edu/data/glove.6B.zip",
    }
    NAME_DIM_MAPPING = {
        "glove-common-42b": {
            300,
        },
        "glove-common-840b": {
            300,
        },
        "glove-twitter": {25, 50, 100, 200},
        "glove-wikipedia": {50, 100, 200, 300},
    }
    # Prefixes must NOT end with a dot: the constructor inserts the separator
    # itself when it appends "<dim>d.txt". A trailing dot here would produce
    # e.g. "glove.840B..300d.txt", which does not exist in the archive.
    _NAME_FILE_MAPPING = {
        "glove-common-42b": "glove.42B",
        "glove-common-840b": "glove.840B",
        "glove-twitter": "glove.twitter.27B",
        "glove-wikipedia": "glove.6B",
    }

    _ARCHIVE_TYPE = "zip"
    # NOTE(review): the vector files are ".txt"; confirm against WordVectors
    # what exactly its ``binary`` flag controls before changing this value.
    _BINARY = True

    def __init__(
        self,
        name="glove-wikipedia",
        dim=300,
        default_vector_function=random_normal_default_vector,
        cache_path=None,
        max_vectors=None,
    ):
        """
        GloVe constructor that initializes vector storage and downloads vectors
        if necessary.

        Parameters
        ----------
        name : str
            name of glove vectors instance, available names are listed in
            NAME_URL_MAPPING dictionary
        dim : int
            vectors dimension, available dimensions are listed in NAME_DIM_MAPPING
            dictionary
        default_vector_function : callable, optional
            which vector should be returned if vectorizer doesn't have
            representation for given token. If None and token doesn't
            exist, an error is raised while obtaining a vector
        cache_path : str
            path for caching vectors, useful if not loading all vectors from file
            by either loading some arbitrary number of vectors (see max_vectors) or
            by loading vectors for vocabulary.
        max_vectors : int
            maximum number of vectors to load in memory

        Raises
        ------
        ValueError
            If given name is not in NAME_URL_MAPPING keys or if the given vectors
            dimension is not available. Supported dimensions are available in
            NAME_DIM_MAPPING dictionary.
        """
        # Validate the name first: the dimension lookup below indexes
        # NAME_DIM_MAPPING by name and would otherwise raise a KeyError.
        if name not in GloVe.NAME_URL_MAPPING:
            raise ValueError(
                "Given name not supported, supported names are "
                f"{GloVe.NAME_URL_MAPPING.keys()}"
            )
        if dim not in GloVe.NAME_DIM_MAPPING[name]:
            raise ValueError(
                "Unsupported dimension for given glove instance, "
                f"{name} GloVe instance has following supported dimensions "
                f"{GloVe.NAME_DIM_MAPPING[name]}"
            )

        # The LargeResource instance is intentionally discarded; it is created
        # only so that the archive is made available before the path is used.
        url = GloVe.NAME_URL_MAPPING[name]
        LargeResource(
            **{
                LargeResource.RESOURCE_NAME: name,
                LargeResource.ARCHIVE: GloVe._ARCHIVE_TYPE,
                LargeResource.URI: url,
            }
        )

        # Vectors live in <BASE_RESOURCE_DIR>/<name>/<prefix>.<dim>d.txt
        file_name = f"{GloVe._NAME_FILE_MAPPING[name]}.{dim}d.txt"
        path = os.path.join(LargeResource.BASE_RESOURCE_DIR, name, file_name)

        super().__init__(
            path=path,
            default_vector_function=default_vector_function,
            cache_path=cache_path,
            max_vectors=max_vectors,
            binary=GloVe._BINARY,
        )