# Source code for openprompt.data_utils.typing_dataset

"""
This file contains the logic for loading data for all typing tasks.
# TODO license
"""

import os
import json, csv
from abc import ABC, abstractmethod
from collections import defaultdict, Counter
from typing import *

from transformers.tokenization_utils import SPECIAL_TOKENS_MAP_FILE

from openprompt.utils.logging import logger

from openprompt.data_utils.utils import InputExample
from openprompt.data_utils.data_processor import DataProcessor



class FewNERDProcessor(DataProcessor):
    """
    `Few-NERD <https://ningding97.github.io/fewnerd/>`_ is a large-scale, fine-grained, manually annotated named entity recognition dataset.

    It was released together with `Few-NERD: Not Only a Few-shot NER Dataset (Ning Ding et al. 2021) <https://arxiv.org/pdf/2105.07464.pdf>`_

    Every entity span found in a sentence yields one example: ``text_a`` is the
    whole sentence, ``meta["entity"]`` is the span text and ``label`` is the id
    of the span's type.

    Examples:

    ..  code-block:: python

        import os

        from openprompt.data_utils.typing_dataset import PROCESSORS

        base_path = "datasets/Typing"

        dataset_name = "FewNERD"
        dataset_path = os.path.join(base_path, dataset_name)
        processor = PROCESSORS[dataset_name.lower()]()
        train_dataset = processor.get_train_examples(dataset_path)
        dev_dataset = processor.get_dev_examples(dataset_path)
        test_dataset = processor.get_test_examples(dataset_path)

        assert processor.get_num_labels() == 66
        assert processor.get_labels() == [
            "person-actor", "person-director", "person-artist/author", "person-athlete", "person-politician", "person-scholar", "person-soldier", "person-other",
            "organization-showorganization", "organization-religion", "organization-company", "organization-sportsteam", "organization-education", "organization-government/governmentagency", "organization-media/newspaper", "organization-politicalparty", "organization-sportsleague", "organization-other",
            "location-GPE", "location-road/railway/highway/transit", "location-bodiesofwater", "location-park", "location-mountain", "location-island", "location-other",
            "product-software", "product-food", "product-game", "product-ship", "product-train", "product-airplane", "product-car", "product-weapon", "product-other",
            "building-theater", "building-sportsfacility", "building-airport", "building-hospital", "building-library", "building-hotel", "building-restaurant", "building-other",
            "event-sportsevent", "event-attack/battle/war/militaryconflict", "event-disaster", "event-election", "event-protest", "event-other",
            "art-music", "art-writtenart", "art-film", "art-painting", "art-broadcastprogram", "art-other",
            "other-biologything", "other-chemicalthing", "other-livingthing", "other-astronomything", "other-god", "other-law", "other-award", "other-disease", "other-medical", "other-language", "other-currency", "other-educationaldegree",
        ]
        assert dev_dataset[0].text_a == "The final stage in the development of the Skyfox was the production of a model with tricycle landing gear to better cater for the pilot training market ."
        assert dev_dataset[0].meta["entity"] == "Skyfox"
        assert dev_dataset[0].label == 30
    """
    def __init__(self):
        super().__init__()
        # The order of this list matters: the class docstring example expects
        # label ids to follow it (e.g. "product-airplane" -> 30).
        self.labels = [
            "person-actor", "person-director", "person-artist/author", "person-athlete", "person-politician", "person-scholar", "person-soldier", "person-other",
            "organization-showorganization", "organization-religion", "organization-company", "organization-sportsteam", "organization-education", "organization-government/governmentagency", "organization-media/newspaper", "organization-politicalparty", "organization-sportsleague", "organization-other",
            "location-GPE", "location-road/railway/highway/transit", "location-bodiesofwater", "location-park", "location-mountain", "location-island", "location-other",
            "product-software", "product-food", "product-game", "product-ship", "product-train", "product-airplane", "product-car", "product-weapon", "product-other",
            "building-theater", "building-sportsfacility", "building-airport", "building-hospital", "building-library", "building-hotel", "building-restaurant", "building-other",
            "event-sportsevent", "event-attack/battle/war/militaryconflict", "event-disaster", "event-election", "event-protest", "event-other",
            "art-music", "art-writtenart", "art-film", "art-painting", "art-broadcastprogram", "art-other",
            "other-biologything", "other-chemicalthing", "other-livingthing", "other-astronomything", "other-god", "other-law", "other-award", "other-disease", "other-medical", "other-language", "other-currency", "other-educationaldegree",
        ]

    def get_examples(self, data_dir, split):
        """
        Build one example per entity span of a split.

        Args:
            data_dir (:obj:`str`): Dataset root; the file
                ``supervised/<split>.txt`` below it is read as UTF-8.
            split (:obj:`str`): Stem of the split file to read.

        Returns:
            :obj:`List[InputExample]`: One example per span. All spans of a
            sentence share the sentence's index as ``guid`` and the full
            sentence as ``text_a``; sentences without any span yield nothing.
        """
        path = os.path.join(data_dir, "supervised/{}.txt".format(split))
        with open(path, encoding='utf8') as f:
            data = FewNERDProcessor.load_data(f)

        examples = []
        for idx, (xs, ys, spans) in enumerate(data):
            # The sentence text is the same for every span: join it once.
            text_a = " ".join(xs)
            for start, end in spans:
                # Span bounds are inclusive token indices.
                meta = {"entity": " ".join(xs[start: end + 1])}
                # The tag at the span start is "B-<type>"; drop the 2-char prefix.
                label = self.get_label_id(ys[start][2:])
                examples.append(InputExample(guid=str(idx), text_a=text_a, meta=meta, label=label))

        return examples

    @staticmethod
    def load_data(file):
        """
        Parse token-per-line annotations into sentences with BIO tags and spans.

        Each non-blank line is split on whitespace: the first field is the
        token and the last field is its type (``'O'`` for no entity). Blank
        lines separate sentences. Consecutive tokens carrying the same
        non-``'O'`` type are merged into a single span, so two adjacent
        entities of the same type cannot be told apart.

        Args:
            file: An iterable of text lines, e.g. an open text file.

        Returns:
            :obj:`List[Tuple[List[str], List[str], List[List[int]]]]`: One
            ``(tokens, tags, spans)`` triple per sentence, where ``tags`` are
            ``'O'``, ``'B-<type>'`` or ``'I-<type>'`` and each span is an
            inclusive ``[start, end]`` pair of token indices.
        """
        data = []
        xs, ys, spans = [], [], []

        for line in file:
            pair = line.split()
            if not pair:
                # Blank line: close the current sentence (if any) and start anew.
                if xs:
                    data.append((xs, ys, spans))
                xs, ys, spans = [], [], []
                continue

            xs.append(pair[0])
            tag = pair[-1]
            if tag != 'O':
                # 'O'[2:] is '', so a type following 'O' always opens a new span.
                if not ys or tag != ys[-1][2:]:
                    tag = 'B-' + tag
                    spans.append([len(ys), len(ys)])
                else:
                    tag = 'I-' + tag
                    # Extend the open span to include the current token.
                    spans[-1][-1] = len(ys)
            ys.append(tag)

        # Keep the last sentence even when the input lacks a trailing blank line.
        if xs:
            data.append((xs, ys, spans))
        return data

# Registry used to look up a processor class by its lowercase dataset name.
# Entries that are commented out are disabled and kept only for reference.
PROCESSORS = {
    # "conll2003": Conll2003Processor,
    # "ontonotes5_0": OntoNotes5_0Processor,
    # "bbn": BBNProcessor,
    "fewnerd": FewNERDProcessor,
}