r"""undocumented"""
__all__ = [
"ConllLoader",
"Conll2003Loader",
"Conll2003NERLoader",
"OntoNotesNERLoader",
"CTBLoader",
"CNNERLoader",
"MsraNERLoader",
"WeiboNERLoader",
"PeopleDailyNERLoader"
]
import glob
import os
import random
import shutil
import time
from .loader import Loader
from ..file_reader import _read_conll
from ...core.const import Const
from ...core.dataset import DataSet
from ...core.instance import Instance
class ConllLoader(Loader):
    r"""
    ConllLoader reads data in the following format: samples are separated by blank lines, and within a sample each
    line holds one token, with its columns separated by spaces or tabs, as in the following example.

    Example::

        # content of the file
        Nadim NNP B-NP B-PER
        Ladki NNP I-NP I-PER

        AL-AIN NNP B-NP B-LOC
        United NNP B-NP B-LOC
        Arab NNP I-NP I-LOC
        Emirates NNPS I-NP I-LOC
        1996-12-06 CD I-NP O
        ...

        # read with the following arguments, the returned DataSet has the fields raw_words and pos,
        # taken from column 0 and column 1 respectively
        dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll')
        # read with the following arguments, the returned DataSet has the fields raw_words and ner,
        # taken from column 0 and column 3 respectively
        dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll')
        # read with the following arguments, the returned DataSet has three fields: raw_words, pos and ner
        dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll')

    The fields of the DataSet returned by ConllLoader are determined by the ``headers`` argument.
    Lines starting with "-DOCSTART-" are ignored, since this token is used as the document separator in CoNLL 2003.
"""
    def __init__(self, headers, sep=None, indexes=None, dropna=True):
        r"""
        :param list headers: name of each column; must be a list or tuple of str. ``headers`` corresponds one-to-one with ``indexes``.
        :param str sep: column separator. Defaults to ``None``, in which case columns are split on whitespace.
        :param list indexes: indexes (0-based) of the columns to keep. If ``None``, all columns are kept. Default: ``None``.
        :param bool dropna: whether to ignore invalid lines; if ``False``, a ``ValueError`` is raised when invalid data is encountered. Default: ``True``.
        """
super(ConllLoader, self).__init__()
if not isinstance(headers, (list, tuple)):
raise TypeError(
'invalid headers: {}, should be list of strings'.format(headers))
self.headers = headers
self.dropna = dropna
        self.sep = sep
if indexes is None:
self.indexes = list(range(len(self.headers)))
else:
            if len(indexes) != len(headers):
                raise ValueError('the length of indexes should equal the length of headers')
self.indexes = indexes
    def _load(self, path):
        r"""
        Reads the file at the given path into a DataSet; its fields are determined by the ``headers`` passed when
        the ConllLoader was initialized.

        :param str path: path to the file
        :return: DataSet
        """
ds = DataSet()
        for idx, data in _read_conll(path, sep=self.sep, indexes=self.indexes, dropna=self.dropna):
ins = {h: data[i] for i, h in enumerate(self.headers)}
ds.append(Instance(**ins))
return ds
class Conll2003Loader(ConllLoader):
    r"""
    Reads data for the CoNLL 2003 task. The content should look like the following: the first column is raw_words,
    the second column is pos, the third column is chunking, and the fourth column is ner.

    Example::

        Nadim NNP B-NP B-PER
        Ladki NNP I-NP I-PER

        AL-AIN NNP B-NP B-LOC
        United NNP B-NP B-LOC
        Arab NNP I-NP I-LOC
        Emirates NNPS I-NP I-LOC
        1996-12-06 CD I-NP O
        ...

    The returned DataSet has the following content.

    .. csv-table:: Structure of the data after loading with Conll2003Loader.
       :header: "raw_words", "pos", "chunk", "ner"

       "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]"
       "[AL-AIN, United, Arab, ...]", "[NNP, NNP, NNP, ...]", "[B-NP, B-NP, I-NP, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
       "[...]", "[...]", "[...]", "[...]"
"""
def __init__(self):
headers = [
'raw_words', 'pos', 'chunk', 'ner',
]
super(Conll2003Loader, self).__init__(headers=headers)
    def _load(self, path):
        r"""
        Reads the file at the given path into a DataSet; its fields are determined by the ``headers`` passed when
        the ConllLoader was initialized.

        :param str path: path to the file
        :return: DataSet
        """
ds = DataSet()
for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
doc_start = False
for i, h in enumerate(self.headers):
field = data[i]
if str(field[0]).startswith('-DOCSTART-'):
doc_start = True
break
if doc_start:
continue
ins = {h: data[i] for i, h in enumerate(self.headers)}
ds.append(Instance(**ins))
return ds
def download(self, output_dir=None):
raise RuntimeError("conll2003 cannot be downloaded automatically.")
class Conll2003NERLoader(ConllLoader):
    r"""
    Reads the NER data of the CoNLL 2003 task. Each line has 4 columns, and a blank line separates two sentences.

    The supported content looks like the following.

    Example::

        Nadim NNP B-NP B-PER
        Ladki NNP I-NP I-PER

        AL-AIN NNP B-NP B-LOC
        United NNP B-NP B-LOC
        Arab NNP I-NP I-LOC
        Emirates NNPS I-NP I-LOC
        1996-12-06 CD I-NP O
        ...

    The returned DataSet has the following content.

    .. csv-table:: Structure of the data after loading with Conll2003NERLoader; target uses BIO2 encoding.
       :header: "raw_words", "target"

       "[Nadim, Ladki]", "[B-PER, I-PER]"
       "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
       "[...]", "[...]"
"""
def __init__(self):
headers = [
'raw_words', 'target',
]
super().__init__(headers=headers, indexes=[0, 3])
    def _load(self, path):
        r"""
        Reads the file at the given path into a DataSet; its fields are determined by the ``headers`` passed when
        the ConllLoader was initialized.

        :param str path: path to the file
        :return: DataSet
        """
ds = DataSet()
for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
doc_start = False
for i, h in enumerate(self.headers):
field = data[i]
if str(field[0]).startswith('-DOCSTART-'):
doc_start = True
break
if doc_start:
continue
ins = {h: data[i] for i, h in enumerate(self.headers)}
ds.append(Instance(**ins))
        if len(ds) == 0:
            raise RuntimeError("No data found in {}.".format(path))
return ds
def download(self):
raise RuntimeError("conll2003 cannot be downloaded automatically.")
class OntoNotesNERLoader(ConllLoader):
    r"""
    Reads the OntoNotes NER data, which is also the NER data of the CoNLL 2012 task. For converting the OntoNotes
    data into conll format, see https://github.com/yhcc/OntoNotes-5.0-NER . OntoNotesNERLoader reads the 4th and
    the 11th columns (indexes 3 and 10).

    The data to read looks like:

    Example::

        bc/msnbc/00/msnbc_0000 0 0 Hi UH (TOP(FRAG(INTJ*) - - - Dan_Abrams * -
        bc/msnbc/00/msnbc_0000 0 1 everyone NN (NP*) - - - Dan_Abrams * -
        ...

    The returned DataSet has the following content.

    .. csv-table::
       :header: "raw_words", "target"

       "['Hi', 'everyone', '.']", "['O', 'O', 'O']"
       "['first', 'up', 'on', 'the', 'docket']", "['O', 'O', 'O', 'O', 'O']"
       "[...]", "[...]"
"""
def __init__(self):
super().__init__(headers=[Const.RAW_WORD, Const.TARGET], indexes=[3, 10])
def _load(self, path: str):
dataset = super()._load(path)
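        # convert_to_bio maps OntoNotes-style bracketed span tags to BIO tags,
        # e.g. ['(ORG*', '*)', '*', '(GPE)'] -> ['B-ORG', 'I-ORG', 'O', 'B-GPE']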
def convert_to_bio(tags):
bio_tags = []
flag = None
for tag in tags:
label = tag.strip("()*")
if '(' in tag:
bio_label = 'B-' + label
flag = label
elif flag:
bio_label = 'I-' + flag
else:
bio_label = 'O'
if ')' in tag:
flag = None
bio_tags.append(bio_label)
return bio_tags
def convert_word(words):
converted_words = []
for word in words:
                word = word.replace('/.', '.')  # some sentence-final periods appear as '/.'
if not word.startswith('-'):
converted_words.append(word)
continue
                # the following symbols are escaped in the data; convert them back
tfrs = {'-LRB-': '(',
'-RRB-': ')',
'-LSB-': '[',
'-RSB-': ']',
'-LCB-': '{',
'-RCB-': '}'
}
if word in tfrs:
converted_words.append(tfrs[word])
else:
converted_words.append(word)
return converted_words
dataset.apply_field(convert_word, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD)
dataset.apply_field(convert_to_bio, field_name=Const.TARGET, new_field_name=Const.TARGET)
return dataset
def download(self):
raise RuntimeError("Ontonotes cannot be downloaded automatically, you can refer "
"https://github.com/yhcc/OntoNotes-5.0-NER to download and preprocess.")
class CTBLoader(Loader):
    r"""
    The data to load should have the following format, where the second column is the word, the fourth column is
    the pos tag, the seventh column is the head of the dependency tree, and the eighth column is the dependency
    label.

    Example::

        1 印度 _ NR NR _ 3 nn _ _
        2 海军 _ NN NN _ 3 nn _ _
        3 参谋长 _ NN NN _ 5 nsubjpass _ _
        4 被 _ SB SB _ 5 pass _ _
        5 解职 _ VV VV _ 0 root _ _

        1 新华社 _ NR NR _ 7 dep _ _
        2 新德里 _ NR NR _ 7 dep _ _
        3 12月 _ NT NT _ 7 dep _ _
        ...

    After loading, the DataSet has the following structure.

    .. csv-table::
       :header: "raw_words", "pos", "dep_head", "dep_label"

       "[印度, 海军, ...]", "[NR, NN, SB, ...]", "[3, 3, ...]", "[nn, nn, ...]"
       "[新华社, 新德里, ...]", "[NR, NR, NT, ...]", "[7, 7, 7, ...]", "[dep, dep, dep, ...]"
       "[...]", "[...]", "[...]", "[...]"
"""
def __init__(self):
super().__init__()
headers = [
'raw_words', 'pos', 'dep_head', 'dep_label',
]
indexes = [
1, 3, 6, 7,
]
self.loader = ConllLoader(headers=headers, indexes=indexes)
def _load(self, path: str):
dataset = self.loader._load(path)
return dataset
    def download(self):
        r"""
        Automatic download is not available due to licensing restrictions. See
        https://catalog.ldc.upenn.edu/LDC2013T21

        :return:
        """
raise RuntimeError("CTB cannot be downloaded automatically.")
class CNNERLoader(Loader):
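    r"""
    Base class shared by the character-level Chinese NER loaders below (MSRA, Weibo, People's Daily); see ``_load``
    for the expected file format.
    """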
    def _load(self, path: str):
        r"""
        Supports loading content in the following format: each line has two columns separated by whitespace, and a
        blank line separates two samples.

        Example::

            我 O
            们 O
            变 O
            而 O
            以 O
            书 O
            会 O
            ...

        :param str path: path to the file
        :return: DataSet with a raw_chars field and a target field
        """
ds = DataSet()
with open(path, 'r', encoding='utf-8') as f:
raw_chars = []
target = []
for line in f:
line = line.strip()
if line:
parts = line.split()
                    if len(parts) == 1:  # some lines in data downloaded from the web are missing the tag column; default to 'O'
parts.append('O')
raw_chars.append(parts[0])
target.append(parts[1])
else:
if raw_chars:
ds.append(Instance(raw_chars=raw_chars, target=target))
raw_chars = []
target = []
return ds
class MsraNERLoader(CNNERLoader):
    r"""
    Reads the MSRA-NER data, whose format should look like the following.

    Example::

        把 O
        欧 B-LOC
        美 B-LOC
        、 O
        港 B-LOC
        台 B-LOC
        流 O
        行 O
        的 O
        食 O
        ...

    After loading, the DataSet contains the following fields.

    .. csv-table::
       :header: "raw_chars", "target"

       "['把', '欧']", "['O', 'B-LOC']"
       "['美', '、']", "['B-LOC', 'O']"
       "[...]", "[...]"
"""
def __init__(self):
super().__init__()
    def download(self, dev_ratio: float = 0.1, re_download: bool = False) -> str:
        r"""
        Automatically downloads the MSRA-NER data. If you use this data, please cite Gina-Anne Levow, 2006,
        The Third International Chinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition.

        A random portion of the train data is split off as the dev data according to ``dev_ratio``. After the
        download finishes, the output directory contains three files: train.conll, test.conll and dev.conll.

        :param float dev_ratio: if the path contains no dev set, the fraction of train to split off as dev. If 0, no dev split is made.
        :param bool re_download: whether to re-download the data, so that it is re-split.
        :return: str, path of the dataset directory
        """
dataset_name = 'msra-ner'
data_dir = self._get_dataset_path(dataset_name=dataset_name)
modify_time = 0
for filepath in glob.glob(os.path.join(data_dir, '*')):
modify_time = os.stat(filepath).st_mtime
break
        if time.time() - modify_time > 1 and re_download:  # a somewhat crude check of whether the files were just downloaded
shutil.rmtree(data_dir)
data_dir = self._get_dataset_path(dataset_name=dataset_name)
if not os.path.exists(os.path.join(data_dir, 'dev.conll')):
if dev_ratio > 0:
assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)."
try:
with open(os.path.join(data_dir, 'train.conll'), 'r', encoding='utf-8') as f, \
open(os.path.join(data_dir, 'middle_file.conll'), 'w', encoding='utf-8') as f1, \
open(os.path.join(data_dir, 'dev.conll'), 'w', encoding='utf-8') as f2:
                        lines = []  # one sample spans multiple lines
for line in f:
line = line.strip()
if line:
lines.append(line)
else:
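                                # a blank line ends the current sample; route the whole sample to either dev or train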
if random.random() < dev_ratio:
f2.write('\n'.join(lines) + '\n\n')
else:
f1.write('\n'.join(lines) + '\n\n')
lines.clear()
os.remove(os.path.join(data_dir, 'train.conll'))
os.renames(os.path.join(data_dir, 'middle_file.conll'), os.path.join(data_dir, 'train.conll'))
finally:
if os.path.exists(os.path.join(data_dir, 'middle_file.conll')):
os.remove(os.path.join(data_dir, 'middle_file.conll'))
return data_dir
class WeiboNERLoader(CNNERLoader):
    r"""
    Reads the WeiboNER data, whose format should look like the following.

    Example::

        老 B-PER.NOM
        百 I-PER.NOM
        姓 I-PER.NOM

        心 O
        ...

    After loading, the DataSet contains the following fields.

    .. csv-table::
       :header: "raw_chars", "target"

       "['老', '百', '姓']", "['B-PER.NOM', 'I-PER.NOM', 'I-PER.NOM']"
       "['心']", "['O']"
       "[...]", "[...]"
"""
def __init__(self):
super().__init__()
    def download(self) -> str:
        r"""
        Automatically downloads the Weibo-NER data. If you use this data, please cite Nanyun Peng and Mark Dredze,
        2015, Named Entity Recognition for Chinese Social Media with Jointly Trained Embeddings.

        :return: str
        """
dataset_name = 'weibo-ner'
data_dir = self._get_dataset_path(dataset_name=dataset_name)
return data_dir
class PeopleDailyNERLoader(CNNERLoader):
    r"""
    The supported data format is as follows.

    Example::

        中 B-ORG
        共 I-ORG
        中 I-ORG
        央 I-ORG
        致 O
        中 B-ORG
        ...

    After loading, the DataSet contains the following fields.

    .. csv-table:: The target column uses BIO encoding.
       :header: "raw_chars", "target"

       "['中', '共', '中', '央']", "['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']"
       "[...]", "[...]"
"""
def __init__(self):
super().__init__()
def download(self) -> str:
dataset_name = 'peopledaily'
data_dir = self._get_dataset_path(dataset_name=dataset_name)
return data_dir