Source code for openprompt.data_utils.typing_dataset

"""
This file contains the logic for loading data for all typing tasks.
# TODO license
"""

import os
import json, csv
from abc import ABC, abstractmethod
from collections import defaultdict, Counter
from typing import *

from transformers.tokenization_utils import SPECIAL_TOKENS_MAP_FILE

from openprompt.utils.logging import logger

from openprompt.data_utils.utils import InputExample
from openprompt.data_utils.data_processor import DataProcessor



[docs]class FewNERDProcessor(DataProcessor): """ `Few-NERD <https://ningding97.github.io/fewnerd/>`_ a large-scale, fine-grained manually annotated named entity recognition dataset It was released together with `Few-NERD: Not Only a Few-shot NER Dataset (Ning Ding et al. 2021) <https://arxiv.org/pdf/2105.07464.pdf>`_ Examples: .. code-block:: python from openprompt.data_utils.typing_dataset import PROCESSORS base_path = "datasets/Typing" dataset_name = "FewNERD" dataset_path = os.path.join(base_path, dataset_name) processor = PROCESSORS[dataset_name.lower()]() train_dataset = processor.get_train_examples(dataset_path) dev_dataset = processor.get_dev_examples(dataset_path) test_dataset = processor.get_test_examples(dataset_path) assert processor.get_num_labels() == 66 assert processor.get_labels() == [ "person-actor", "person-director", "person-artist/author", "person-athlete", "person-politician", "person-scholar", "person-soldier", "person-other", "organization-showorganization", "organization-religion", "organization-company", "organization-sportsteam", "organization-education", "organization-government/governmentagency", "organization-media/newspaper", "organization-politicalparty", "organization-sportsleague", "organization-other", "location-GPE", "location-road/railway/highway/transit", "location-bodiesofwater", "location-park", "location-mountain", "location-island", "location-other", "product-software", "product-food", "product-game", "product-ship", "product-train", "product-airplane", "product-car", "product-weapon", "product-other", "building-theater", "building-sportsfacility", "building-airport", "building-hospital", "building-library", "building-hotel", "building-restaurant", "building-other", "event-sportsevent", "event-attack/battle/war/militaryconflict", "event-disaster", "event-election", "event-protest", "event-other", "art-music", "art-writtenart", "art-film", "art-painting", "art-broadcastprogram", "art-other", "other-biologything", "other-chemicalthing", "other-livingthing", "other-astronomything", "other-god", "other-law", "other-award", "other-disease", "other-medical", "other-language", "other-currency", "other-educationaldegree", ] assert dev_dataset[0].text_a == "The final stage in the development of the Skyfox was the production of a model with tricycle landing gear to better cater for the pilot training market ." assert dev_dataset[0].meta["entity"] == "Skyfox" assert dev_dataset[0].label == 30 """ def __init__(self): super().__init__() self.labels = [ "person-actor", "person-director", "person-artist/author", "person-athlete", "person-politician", "person-scholar", "person-soldier", "person-other", "organization-showorganization", "organization-religion", "organization-company", "organization-sportsteam", "organization-education", "organization-government/governmentagency", "organization-media/newspaper", "organization-politicalparty", "organization-sportsleague", "organization-other", "location-GPE", "location-road/railway/highway/transit", "location-bodiesofwater", "location-park", "location-mountain", "location-island", "location-other", "product-software", "product-food", "product-game", "product-ship", "product-train", "product-airplane", "product-car", "product-weapon", "product-other", "building-theater", "building-sportsfacility", "building-airport", "building-hospital", "building-library", "building-hotel", "building-restaurant", "building-other", "event-sportsevent", "event-attack/battle/war/militaryconflict", "event-disaster", "event-election", "event-protest", "event-other", "art-music", "art-writtenart", "art-film", "art-painting", "art-broadcastprogram", "art-other", "other-biologything", "other-chemicalthing", "other-livingthing", "other-astronomything", "other-god", "other-law", "other-award", "other-disease", "other-medical", "other-language", "other-currency", "other-educationaldegree", ] def get_examples(self, data_dir, split): path = os.path.join(data_dir, "supervised/{}.txt".format(split)) with open(path, encoding='utf8') as f: data = FewNERDProcessor.load_data(f) examples = [] for idx, (xs, ys, spans) in enumerate(data): for span in spans: text_a = " ".join(xs) meta = { "entity": " ".join(xs[span[0]: span[1]+1]) } example = InputExample(guid=str(idx), text_a=text_a, meta=meta, label=self.get_label_id(ys[span[0]][2:])) examples.append(example) return examples @staticmethod def load_data(file): data = [] xs = [] ys = [] spans = [] for line in file.readlines(): pair = line.split() if pair == []: if xs != []: data.append((xs, ys, spans)) xs = [] ys = [] spans = [] else: xs.append(pair[0]) tag = pair[-1] if tag != 'O': if len(ys) == 0 or tag != ys[-1][2:]: tag = 'B-' + tag spans.append([len(ys), len(ys)]) else: tag = 'I-' + tag spans[-1][-1] = len(ys) ys.append(tag) return data
PROCESSORS = { "fewnerd": FewNERDProcessor, # "conll2003": Conll2003Processor, # "ontonotes5_0": OntoNotes5_0Processor, # "bbn": BBNProcessor, }