Source code for podium.datasets.impl.imdb

"""
Module contains the IMDB Large Movie Review Dataset.

Dataset webpage:
http://ai.stanford.edu/~amaas/data/sentiment/

When using this dataset, please cite:
    @InProceedings{maas-EtAl:2011:ACL-HLT2011,
    author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and
    Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
    title     = {Learning Word Vectors for Sentiment Analysis},
    booktitle = {Proceedings of the 49th Annual Meeting of the Association for
    Computational Linguistics: Human Language Technologies},
    month     = {June},
    year      = {2011},
    address   = {Portland, Oregon, USA},
    publisher = {Association for Computational Linguistics},
    pages     = {142--150},
    url       = {http://www.aclweb.org/anthology/P11-1015}
    }
"""

import os

from podium.datasets.dataset import Dataset
from podium.datasets.example_factory import ExampleFactory
from podium.field import Field, LabelField
from podium.storage.resources.large_resource import LargeResource
from podium.vocab import Vocab


class IMDB(Dataset):
    """
    Simple IMDB dataset with only supervised data which uses non processed
    data.

    Attributes
    ----------
    NAME : str
        dataset name
    URL : str
        url to the imdb dataset
    DATASET_DIR : str
        name of the folder in the dataset containing train and test
        directories
    ARCHIVE_TYPE : str
        string that defines archive type, used for unpacking dataset
    TRAIN_DIR : str
        name of the training directory
    TEST_DIR : str
        name of the directory containing test examples
    POSITIVE_LABEL_DIR : str
        name of the subdirectory containing examples with positive sentiment
    NEGATIVE_LABEL_DIR : str
        name of the subdirectory containing examples with negative sentiment
    TEXT_FIELD_NAME : str
        name of the field containing comment text
    LABEL_FIELD_NAME : str
        name of the field containing label value
    POSITIVE_LABEL : str
        positive sentiment label
    NEGATIVE_LABEL : str
        negative sentiment label
    """

    NAME = "imdb"
    URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    DATASET_DIR = os.path.join("imdb", "aclImdb")
    ARCHIVE_TYPE = "tar"
    TRAIN_DIR = "train"
    TEST_DIR = "test"
    POSITIVE_LABEL_DIR = "pos"
    NEGATIVE_LABEL_DIR = "neg"
    TEXT_FIELD_NAME = "text"
    LABEL_FIELD_NAME = "label"
    POSITIVE_LABEL = "positive"
    NEGATIVE_LABEL = "negative"

    def __init__(self, dir_path, fields):
        """
        Dataset constructor. User should use the static method
        ``get_dataset_splits`` rather than calling the constructor directly.

        Parameters
        ----------
        dir_path : str
            path to the directory containing the ``pos`` and ``neg``
            subdirectories of one dataset split
        fields : dict(str, Field)
            dictionary that maps field name to the field
        """
        # The instance is deliberately discarded; it is constructed only for
        # its side effect.
        # NOTE(review): presumably this downloads and unpacks the archive
        # when it is not already present locally -- confirm against
        # LargeResource.
        LargeResource(
            **{
                LargeResource.RESOURCE_NAME: IMDB.NAME,
                LargeResource.ARCHIVE: IMDB.ARCHIVE_TYPE,
                LargeResource.URI: IMDB.URL,
            }
        )
        examples = self._create_examples(dir_path=dir_path, fields=fields)
        super().__init__(examples=examples, fields=fields)

    @staticmethod
    def _create_examples(dir_path, fields):
        """
        Method creates examples for imdb dataset. Examples are arranged in
        two folders, one for examples with positive sentiment and other with
        negative sentiment. One file in each folder represents one example.

        Parameters
        ----------
        dir_path : str
            directory where files with examples are positioned
        fields : dict(str, Field)
            dictionary mapping field names to fields

        Returns
        -------
        examples : list(Example)
            list of examples from given dir_path; all positive examples
            come first, followed by all negative examples
        """
        dir_pos_path = os.path.join(dir_path, IMDB.POSITIVE_LABEL_DIR)
        dir_neg_path = os.path.join(dir_path, IMDB.NEGATIVE_LABEL_DIR)

        examples = []
        examples.extend(
            IMDB._create_labeled_examples(dir_pos_path, IMDB.POSITIVE_LABEL, fields)
        )
        examples.extend(
            IMDB._create_labeled_examples(dir_neg_path, IMDB.NEGATIVE_LABEL, fields)
        )
        return examples

    @staticmethod
    def _create_labeled_examples(dir_path, label, fields):
        """
        Method creates examples for imdb dataset with given label. Examples
        are positioned in multiple files that are in one folder.

        Parameters
        ----------
        dir_path : str
            directory where files with examples are positioned
        label : str
            label assigned to every example read from ``dir_path``
        fields : dict(str, Field)
            dictionary mapping field names to fields

        Returns
        -------
        examples : list(Example)
            list of examples from given dir_path, ordered by file name
        """
        example_factory = ExampleFactory(fields)

        # os.listdir returns entries in arbitrary, filesystem-dependent
        # order; sorting makes the example order reproducible across
        # machines and runs.
        file_names = [
            name
            for name in sorted(os.listdir(dir_path))
            if os.path.isfile(os.path.join(dir_path, name))
        ]

        examples = []
        for file_name in file_names:
            file_path = os.path.join(dir_path, file_name)
            with open(file=file_path, encoding="utf8") as fpr:
                data = {
                    IMDB.TEXT_FIELD_NAME: fpr.read(),
                    IMDB.LABEL_FIELD_NAME: label,
                }
            examples.append(example_factory.from_dict(data))
        return examples

    @staticmethod
    def get_dataset_splits(fields=None):
        """
        Method creates train and test dataset for Imdb dataset.

        Parameters
        ----------
        fields : dict(str, Field), optional
            dictionary mapping field name to field, if not given method will
            use ``get_default_fields``. User should use default field names
            defined in class attributes.

        Returns
        -------
        (train_dataset, test_dataset) : (Dataset, Dataset)
            tuple containing train dataset and test dataset
        """
        data_location = os.path.join(
            LargeResource.BASE_RESOURCE_DIR, IMDB.DATASET_DIR
        )

        # A missing or empty mapping falls back to the default fields.
        if not fields:
            fields = IMDB.get_default_fields()

        # Both splits are built with the same field objects.
        train_dataset = IMDB(
            dir_path=os.path.join(data_location, IMDB.TRAIN_DIR), fields=fields
        )
        test_dataset = IMDB(
            dir_path=os.path.join(data_location, IMDB.TEST_DIR), fields=fields
        )
        return (train_dataset, test_dataset)

    @staticmethod
    def get_default_fields():
        """
        Method returns default Imdb fields: text and label.

        The text field is tokenized with the ``"spacy"`` tokenizer and
        numericalized with a default ``Vocab``; the label field uses a
        ``Vocab`` created with no special tokens.

        Returns
        -------
        fields : dict(str, Field)
            Dictionary mapping field name to field.
        """
        text = Field(
            name=IMDB.TEXT_FIELD_NAME,
            numericalizer=Vocab(),
            tokenizer="spacy",
        )
        label = LabelField(
            name=IMDB.LABEL_FIELD_NAME, numericalizer=Vocab(specials=())
        )
        return {IMDB.TEXT_FIELD_NAME: text, IMDB.LABEL_FIELD_NAME: label}