Source code for fastNLP.io.loader.conll

r"""undocumented"""

__all__ = [
    "ConllLoader",
    "Conll2003Loader",
    "Conll2003NERLoader",
    "OntoNotesNERLoader",
    "CTBLoader",
    "CNNERLoader",
    "MsraNERLoader",
    "WeiboNERLoader",
    "PeopleDailyNERLoader"
]

import glob
import os
import random
import shutil
import time

from .loader import Loader
from ..file_reader import _read_conll
from ...core.const import Const
from ...core.dataset import DataSet
from ...core.instance import Instance


class ConllLoader(Loader):
    r"""
    ConllLoader reads data in the following format: two samples are separated by a blank
    line, and within a sample each line holds one token, with the fields separated by
    spaces or tabs, as in the example below:

    Example::

        # contents of the file
        Nadim NNP B-NP B-PER
        Ladki NNP I-NP I-PER

        AL-AIN NNP B-NP B-LOC
        United NNP B-NP B-LOC
        Arab NNP I-NP I-LOC
        Emirates NNPS I-NP I-LOC
        1996-12-06 CD I-NP O
        ...

        # Reading with the following arguments returns a DataSet with the two fields
        # raw_words and pos, taken from columns 0 and 1 respectively
        dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll')
        # Reading with the following arguments returns a DataSet with the two fields
        # raw_words and ner, taken from columns 0 and 3 respectively
        dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll')
        # Reading with the following arguments returns a DataSet with the three fields
        # raw_words, pos and ner
        dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll')

    The fields of the returned DataSet are determined by the ``headers`` argument.
    Lines starting with "-DOCSTART-" are ignored, since that token is used as a
    document separator in CoNLL 2003.
    """

    def __init__(self, headers, sep=None, indexes=None, dropna=True):
        r"""
        :param list headers: the name of each column; must be a list or tuple of str. ``headers`` and ``indexes`` correspond one-to-one.
        :param str sep: the column separator. If ``None`` (default), columns are split on whitespace (spaces or tabs).
        :param list indexes: 0-based indices of the columns to keep. If ``None``, all columns are kept. Default: ``None``
        :param bool dropna: whether to skip invalid data. If ``False``, a ``ValueError`` is raised when invalid data is encountered. Default: ``True``
        """
        super(ConllLoader, self).__init__()
        if not isinstance(headers, (list, tuple)):
            raise TypeError(
                'invalid headers: {}, should be list of strings'.format(headers))
        self.headers = headers
        self.dropna = dropna
        self.sep = sep
        if indexes is None:
            self.indexes = list(range(len(self.headers)))
        else:
            if len(indexes) != len(headers):
                raise ValueError('the number of indexes must equal the number of headers')
            self.indexes = indexes

    def _load(self, path):
        r"""
        Read the file at the given path into a DataSet whose fields are determined
        by the headers passed when the ConllLoader was created.

        :param str path: path to the file
        :return: DataSet
        """
        ds = DataSet()
        for idx, data in _read_conll(path, sep=self.sep, indexes=self.indexes, dropna=self.dropna):
            ins = {h: data[i] for i, h in enumerate(self.headers)}
            ds.append(Instance(**ins))
        return ds

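
# A minimal usage sketch for ConllLoader (not part of the original module; the
# file path below is hypothetical). Any whitespace-separated CoNLL-style file
# with at least four columns works here.
def _conll_loader_example():
    # Keep columns 0, 1 and 3 as raw_words, pos and ner.
    loader = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])
    ds = loader._load('/path/to/train.conll')  # hypothetical path
    # Each Instance holds one sentence; every field is a list of column values.
    print(ds[0]['raw_words'], ds[0]['ner'])
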
class Conll2003Loader(ConllLoader):
    r"""
    Loader for the CoNLL 2003 data. The content should look like the example below;
    the first column is raw_words, the second pos, the third chunking and the
    fourth ner.

    Example::

        Nadim NNP B-NP B-PER
        Ladki NNP I-NP I-PER

        AL-AIN NNP B-NP B-LOC
        United NNP B-NP B-LOC
        Arab NNP I-NP I-LOC
        Emirates NNPS I-NP I-LOC
        1996-12-06 CD I-NP O
        ...

    The returned DataSet has the following content:

    .. csv-table:: Structure of the data after loading with Conll2003Loader
       :header: "raw_words", "pos", "chunk", "ner"

       "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]"
       "[AL-AIN, United, Arab, ...]", "[NNP, NNP, NNP, ...]", "[B-NP, B-NP, I-NP, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
       "[...]", "[...]", "[...]", "[...]"

    """

    def __init__(self):
        headers = [
            'raw_words', 'pos', 'chunk', 'ner',
        ]
        super(Conll2003Loader, self).__init__(headers=headers)

    def _load(self, path):
        r"""
        Read the file at the given path into a DataSet whose fields are determined
        by the headers set in ``__init__``. Document-start ("-DOCSTART-") lines
        are skipped.

        :param str path: path to the file
        :return: DataSet
        """
        ds = DataSet()
        for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
            doc_start = False
            for i, h in enumerate(self.headers):
                field = data[i]
                if str(field[0]).startswith('-DOCSTART-'):
                    doc_start = True
                    break
            if doc_start:
                continue
            ins = {h: data[i] for i, h in enumerate(self.headers)}
            ds.append(Instance(**ins))
        return ds

    def download(self, output_dir=None):
        raise RuntimeError("conll2003 cannot be downloaded automatically.")

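
# Sketch: Conll2003Loader needs no header configuration, since the CoNLL 2003
# column layout (word, pos, chunk, ner) is fixed. The path is hypothetical.
def _conll2003_loader_example():
    ds = Conll2003Loader()._load('/path/to/conll2003/train.txt')  # hypothetical path
    # "-DOCSTART-" document separators have already been skipped by _load.
    print(ds[0]['raw_words'], ds[0]['pos'], ds[0]['chunk'], ds[0]['ner'])
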
class Conll2003NERLoader(ConllLoader):
    r"""
    Loader for the NER part of the CoNLL 2003 data. Each line has four columns;
    a blank line separates two sentences.

    Supported content:

    Example::

        Nadim NNP B-NP B-PER
        Ladki NNP I-NP I-PER

        AL-AIN NNP B-NP B-LOC
        United NNP B-NP B-LOC
        Arab NNP I-NP I-LOC
        Emirates NNPS I-NP I-LOC
        1996-12-06 CD I-NP O
        ...

    The returned DataSet has the following content:

    .. csv-table:: Structure of the data after loading with Conll2003NERLoader; target uses BIO2 encoding
       :header: "raw_words", "target"

       "[Nadim, Ladki]", "[B-PER, I-PER]"
       "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
       "[...]", "[...]"

    """

    def __init__(self):
        headers = [
            'raw_words', 'target',
        ]
        super().__init__(headers=headers, indexes=[0, 3])

    def _load(self, path):
        r"""
        Read the file at the given path into a DataSet whose fields are determined
        by the headers set in ``__init__``. Document-start ("-DOCSTART-") lines
        are skipped.

        :param str path: path to the file
        :return: DataSet
        """
        ds = DataSet()
        for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
            doc_start = False
            for i, h in enumerate(self.headers):
                field = data[i]
                if str(field[0]).startswith('-DOCSTART-'):
                    doc_start = True
                    break
            if doc_start:
                continue
            ins = {h: data[i] for i, h in enumerate(self.headers)}
            ds.append(Instance(**ins))
        if len(ds) == 0:
            raise RuntimeError("No data found {}.".format(path))
        return ds

    def download(self):
        raise RuntimeError("conll2003 cannot be downloaded automatically.")

class OntoNotesNERLoader(ConllLoader):
    r"""
    Loader for the OntoNotes NER data, which is also the NER data of the CoNLL 2012
    task. For converting OntoNotes into the conll format, see
    https://github.com/yhcc/OntoNotes-5.0-NER. OntoNotesNERLoader reads the 4th and
    11th columns.

    Input format:

    Example::

        bc/msnbc/00/msnbc_0000 0 0 Hi UH (TOP(FRAG(INTJ*) - - - Dan_Abrams * -
        bc/msnbc/00/msnbc_0000 0 1 everyone NN (NP*) - - - Dan_Abrams * -
        ...

    The returned DataSet has the following content:

    .. csv-table::
       :header: "raw_words", "target"

       "['Hi', 'everyone', '.']", "['O', 'O', 'O']"
       "['first', 'up', 'on', 'the', 'docket']", "['O', 'O', 'O', 'O', 'O']"
       "[...]", "[...]"

    """

    def __init__(self):
        super().__init__(headers=[Const.RAW_WORD, Const.TARGET], indexes=[3, 10])

    def _load(self, path: str):
        dataset = super()._load(path)

        def convert_to_bio(tags):
            bio_tags = []
            flag = None
            for tag in tags:
                label = tag.strip("()*")
                if '(' in tag:
                    bio_label = 'B-' + label
                    flag = label
                elif flag:
                    bio_label = 'I-' + flag
                else:
                    bio_label = 'O'
                if ')' in tag:
                    flag = None
                bio_tags.append(bio_label)
            return bio_tags

        def convert_word(words):
            converted_words = []
            for word in words:
                word = word.replace('/.', '.')  # some trailing periods appear as "/."
                if not word.startswith('-'):
                    converted_words.append(word)
                    continue
                # the following bracket tokens were escaped in the data; map them back
                tfrs = {'-LRB-': '(',
                        '-RRB-': ')',
                        '-LSB-': '[',
                        '-RSB-': ']',
                        '-LCB-': '{',
                        '-RCB-': '}'
                        }
                if word in tfrs:
                    converted_words.append(tfrs[word])
                else:
                    converted_words.append(word)
            return converted_words

        dataset.apply_field(convert_word, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD)
        dataset.apply_field(convert_to_bio, field_name=Const.TARGET, new_field_name=Const.TARGET)

        return dataset

    def download(self):
        raise RuntimeError("Ontonotes cannot be downloaded automatically, you can refer "
                           "https://github.com/yhcc/OntoNotes-5.0-NER to download and preprocess.")

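
# Worked example of the conversion performed in OntoNotesNERLoader._load above
# (the tags are illustrative, not taken from a real file): the OntoNotes span
# notation
#     ['(PERSON*', '*', '*)', '*', '(GPE*)']
# is rewritten by convert_to_bio into
#     ['B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'B-GPE']
# while convert_word maps escaped bracket tokens such as '-LRB-' back to '('.
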
class CTBLoader(Loader):
    r"""
    The supported data should have the following format: the second column is the
    word, the fourth column the pos tag, the seventh column the head of the
    dependency tree, and the eighth column the dependency label.

    Example::

        1 印度 _ NR NR _ 3 nn _ _
        2 海军 _ NN NN _ 3 nn _ _
        3 参谋长 _ NN NN _ 5 nsubjpass _ _
        4 被 _ SB SB _ 5 pass _ _
        5 解职 _ VV VV _ 0 root _ _

        1 新华社 _ NR NR _ 7 dep _ _
        2 新德里 _ NR NR _ 7 dep _ _
        3 12月 _ NT NT _ 7 dep _ _
        ...

    After loading, the DataSet has the following format:

    .. csv-table::
       :header: "raw_words", "pos", "dep_head", "dep_label"

       "[印度, 海军, ...]", "[NR, NN, SB, ...]", "[3, 3, ...]", "[nn, nn, ...]"
       "[新华社, 新德里, ...]", "[NR, NR, NT, ...]", "[7, 7, 7, ...]", "[dep, dep, dep, ...]"
       "[...]", "[...]", "[...]", "[...]"

    """

    def __init__(self):
        super().__init__()
        headers = [
            'raw_words', 'pos', 'dep_head', 'dep_label',
        ]
        indexes = [
            1, 3, 6, 7,
        ]
        self.loader = ConllLoader(headers=headers, indexes=indexes)

    def _load(self, path: str):
        dataset = self.loader._load(path)
        return dataset

    def download(self):
        r"""
        Due to licensing restrictions, this dataset cannot be downloaded
        automatically. See https://catalog.ldc.upenn.edu/LDC2013T21

        :return:
        """
        raise RuntimeError("CTB cannot be downloaded automatically.")

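
# Sketch: CTBLoader delegates to an internal ConllLoader with a fixed column
# mapping, so loading is a one-liner. The path is hypothetical; the CTB data
# itself must be obtained from the LDC (see download above).
def _ctb_loader_example():
    ds = CTBLoader()._load('/path/to/ctb/train.conll')  # hypothetical path
    print(ds[0]['raw_words'], ds[0]['dep_head'], ds[0]['dep_label'])
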
class CNNERLoader(Loader):
    def _load(self, path: str):
        r"""
        Supports content in the following format: two columns per line, separated
        by whitespace, with a blank line separating two samples.

        Example::

            我 O
            们 O
            变 O
            而 O
            以 O
            书 O
            会 O
            ...

        :param str path: path to the file
        :return: DataSet with a raw_chars field and a target field
        """
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            raw_chars = []
            target = []
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split()
                    if len(parts) == 1:  # some downloaded data is missing the tag column; default to 'O'
                        parts.append('O')
                    raw_chars.append(parts[0])
                    target.append(parts[1])
                else:
                    if raw_chars:
                        ds.append(Instance(raw_chars=raw_chars, target=target))
                        raw_chars = []
                        target = []
            if raw_chars:  # flush the last sample if the file does not end with a blank line
                ds.append(Instance(raw_chars=raw_chars, target=target))
        return ds

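
# Sketch of what CNNERLoader._load produces for the two-column character format
# above: one Instance per blank-line-separated block, with raw_chars and target
# kept aligned. The path is hypothetical.
def _cn_ner_loader_example():
    ds = CNNERLoader()._load('/path/to/char_level.conll')  # hypothetical path
    # e.g. raw_chars=['我', '们', ...], target=['O', 'O', ...]
    print(ds[0]['raw_chars'], ds[0]['target'])
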
class MsraNERLoader(CNNERLoader):
    r"""
    Loader for the MSRA-NER data. The data should look like the following:

    Example::

        把 O
        欧 B-LOC
        美 B-LOC
        、 O
        港 B-LOC
        台 B-LOC
        流 O
        行 O
        的 O
        食 O
        ...

    After loading, the DataSet contains the following fields:

    .. csv-table::
       :header: "raw_chars", "target"

       "['把', '欧']", "['O', 'B-LOC']"
       "['美', '、']", "['B-LOC', 'O']"
       "[...]", "[...]"

    """

    def __init__(self):
        super().__init__()

    def download(self, dev_ratio: float = 0.1, re_download: bool = False) -> str:
        r"""
        Automatically download the MSRA-NER data. If you use this data, please cite
        Gina-Anne Levow, 2006, The Third International Chinese Language Processing
        Bakeoff: Word Segmentation and Named Entity Recognition.

        According to dev_ratio, a random portion of the train data is split off as
        the dev data. After the download finishes, the output directory contains
        three files: train.conll, test.conll and dev.conll.

        :param float dev_ratio: if the data has no dev set, the fraction of train to split off as dev. If 0, no dev set is created.
        :param bool re_download: whether to re-download the data, which also re-splits it.
        :return: str, the directory of the dataset
        """
        dataset_name = 'msra-ner'
        data_dir = self._get_dataset_path(dataset_name=dataset_name)
        modify_time = 0
        for filepath in glob.glob(os.path.join(data_dir, '*')):
            modify_time = os.stat(filepath).st_mtime
            break
        if time.time() - modify_time > 1 and re_download:  # crude heuristic to check whether the files were just downloaded
            shutil.rmtree(data_dir)
            data_dir = self._get_dataset_path(dataset_name=dataset_name)

        if not os.path.exists(os.path.join(data_dir, 'dev.conll')):
            if dev_ratio > 0:
                assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)."
                try:
                    with open(os.path.join(data_dir, 'train.conll'), 'r', encoding='utf-8') as f, \
                            open(os.path.join(data_dir, 'middle_file.conll'), 'w', encoding='utf-8') as f1, \
                            open(os.path.join(data_dir, 'dev.conll'), 'w', encoding='utf-8') as f2:
                        lines = []  # one sample spans multiple lines
                        for line in f:
                            line = line.strip()
                            if line:
                                lines.append(line)
                            else:
                                if random.random() < dev_ratio:
                                    f2.write('\n'.join(lines) + '\n\n')
                                else:
                                    f1.write('\n'.join(lines) + '\n\n')
                                lines.clear()
                        if lines:  # flush the final sample if the file does not end with a blank line
                            if random.random() < dev_ratio:
                                f2.write('\n'.join(lines) + '\n\n')
                            else:
                                f1.write('\n'.join(lines) + '\n\n')
                            lines.clear()
                    os.remove(os.path.join(data_dir, 'train.conll'))
                    os.renames(os.path.join(data_dir, 'middle_file.conll'), os.path.join(data_dir, 'train.conll'))
                finally:
                    if os.path.exists(os.path.join(data_dir, 'middle_file.conll')):
                        os.remove(os.path.join(data_dir, 'middle_file.conll'))

        return data_dir

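
# Sketch: downloading MSRA-NER and loading the resulting splits. The file names
# follow the docstring above (train.conll, dev.conll, test.conll); loading is
# the plain CNNERLoader._load inherited by MsraNERLoader.
def _msra_ner_example():
    loader = MsraNERLoader()
    data_dir = loader.download(dev_ratio=0.1, re_download=False)
    train_ds = loader._load(os.path.join(data_dir, 'train.conll'))
    dev_ds = loader._load(os.path.join(data_dir, 'dev.conll'))
    print(len(train_ds), len(dev_ds))
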
class WeiboNERLoader(CNNERLoader):
    r"""
    Loader for the WeiboNER data. The data should look like the following:

    Example::

        老 B-PER.NOM
        百 I-PER.NOM
        姓 I-PER.NOM

        心 O
        ...

    After loading, the DataSet contains the following fields:

    .. csv-table::
       :header: "raw_chars", "target"

       "['老', '百', '姓']", "['B-PER.NOM', 'I-PER.NOM', 'I-PER.NOM']"
       "['心']", "['O']"
       "[...]", "[...]"

    """

    def __init__(self):
        super().__init__()

    def download(self) -> str:
        r"""
        Automatically download the Weibo-NER data. If you use this data, please cite
        Nanyun Peng and Mark Dredze, 2015, Named Entity Recognition for Chinese
        Social Media with Jointly Trained Embeddings.

        :return: str
        """
        dataset_name = 'weibo-ner'
        data_dir = self._get_dataset_path(dataset_name=dataset_name)

        return data_dir

class PeopleDailyNERLoader(CNNERLoader):
    r"""
    Supports data in the following format:

    Example::

        中 B-ORG
        共 I-ORG
        中 I-ORG
        央 I-ORG

        致 O
        中 B-ORG
        ...

    After loading, the DataSet contains the following fields:

    .. csv-table:: the target column uses BIO encoding
       :header: "raw_chars", "target"

       "['中', '共', '中', '央']", "['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']"
       "[...]", "[...]"

    """

    def __init__(self):
        super().__init__()

    def download(self) -> str:
        dataset_name = 'peopledaily'
        data_dir = self._get_dataset_path(dataset_name=dataset_name)

        return data_dir
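
# Sketch: the People's Daily loader follows the same download-then-load pattern
# as the other CNNERLoader subclasses. The file name 'train.conll' is an
# assumption about the downloaded archive layout, not confirmed by this module.
def _people_daily_example():
    loader = PeopleDailyNERLoader()
    data_dir = loader.download()
    ds = loader._load(os.path.join(data_dir, 'train.conll'))  # assumed file name
    print(ds[0]['raw_chars'], ds[0]['target'])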