# Source code for fastNLP.io.loader.qa

r"""
该文件中的Loader主要用于读取问答式任务的数据

"""


from . import Loader
import json
from ...core import DataSet, Instance

__all__ = ['CMRC2018Loader']


class CMRC2018Loader(Loader):
    r"""
    Loader for the CMRC2018 question-answering dataset.

    Please process the data downloaded through fastNLP directly. The dataset does not
    ship a test set; testing has to be done by uploading to the corresponding
    evaluation system.

    After loading, the training DataSet has the following content; every question has
    exactly one answer

    .. csv-table::
        :header: "title", "context", "question", "answers", "answer_starts", "id"

        "范廷颂", "范廷颂枢机(,),圣名保禄·若瑟()...", "范廷颂是什么时候被任为主教的?", ["1963年"], ["30"], "TRAIN_186_QUERY_0"
        "范廷颂", "范廷颂枢机(,),圣名保禄·若瑟()...", "1990年,范廷颂担任什么职务?", ["1990年被擢升为天..."], ["41"],"TRAIN_186_QUERY_1"
        "...", "...", "...","...", ".", "..."

    ``title`` is the title of the text, and several records may share the same title;
    ``id`` is the id of the question and is unique.

    The validation DataSet has the following content; every question may have three
    answers (sometimes they are just three identical answers)

    .. csv-table::
        :header: "title", "context", "question", "answers", "answer_starts", "id"

        "战国无双3", "《战国无双3》()是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的?", "['光荣和ω-force', '光荣和ω-force', '光荣和ω-force']", "[30, 30, 30]", "DEV_0_QUERY_0"
        "战国无双3", "《战国无双3》()是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的?", "['村雨城', '村雨城', '任天堂游戏谜之村雨城']", "[226, 226, 219]", "DEV_0_QUERY_1"
        "...", "...", "...","...", ".", "..."

    ``answer_starts`` are 0-based indices. For example, in "我来自a复旦大学?" the start
    index of "复" is 4, and in "Russell评价说" the index of "说" is 9, because English
    letters and digits are each counted as one character.

    """

    def __init__(self):
        super().__init__()

    def _load(self, path: str) -> DataSet:
        r"""
        Read one CMRC2018 JSON file and flatten it into a DataSet.

        The file is expected to hold a JSON object whose ``'data'`` key is a list of
        entries; each entry provides ``'title'`` and ``'paragraphs'``, each paragraph
        provides ``'context'`` and ``'qas'``, and each qa provides ``'question'``,
        ``'answers'`` (a list of objects with ``'text'`` and ``'answer_start'``) and
        ``'id'``. One Instance is produced per question.

        :param path: path of the JSON file, read as UTF-8
        :return: a DataSet with the fields ``title``, ``context``, ``question``,
            ``answers``, ``answer_starts`` and ``id``
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)['data']

        ds = DataSet()
        for entry in data:
            title = entry['title']
            # Walk every paragraph rather than only the first one, so entries that
            # carry several paragraphs are not silently truncated. For entries with a
            # single paragraph the result is the same as reading index 0.
            for para in entry['paragraphs']:
                context = para['context']
                for qa in para['qas']:
                    question = qa['question']
                    raw_answers = qa['answers']
                    qa_id = qa['id']  # local renamed so the builtin ``id`` is not shadowed
                    ds.append(Instance(
                        title=title,
                        context=context,
                        question=question,
                        answers=[an['text'] for an in raw_answers],
                        answer_starts=[an['answer_start'] for an in raw_answers],
                        id=qa_id,
                    ))
        return ds

    def download(self) -> str:
        r"""
        Resolve the location of the ``cmrc2018`` dataset via ``self._get_dataset_path``.

        If you use this data, please cite: A Span-Extraction Dataset for Chinese
        Machine Reading Comprehension. Yiming Cui, Ting Liu, etc.

        :return: the value returned by ``self._get_dataset_path('cmrc2018')``
            (presumably the local dataset directory -- confirm against ``Loader``)
        """
        output_dir = self._get_dataset_path('cmrc2018')
        return output_dir