How do you create a PyTorch DataLoader from an interleaved Hugging Face dataset?

2024-02-15

When I interleave datasets, take a tokenized batch, and feed that batch to a PyTorch DataLoader, I get an error:

# -*- coding: utf-8 -*-
"""issues with dataloader and custom data sets

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sbs95as_66mtK9VK_vbaE9gLE-Tjof1-
"""

!pip install datasets
!pip install torch
!pip install transformers

token = None
batch_size = 10
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
  tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)

# -- Get batch from dataset
from datasets import load_dataset
# path, name = 'brando/debug1_af', 'debug1_af'
path, name = 'brando/debug0_af', 'debug0_af'
remove_columns = []
dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format("torch")
print(f'{dataset=}')
batch = dataset.take(batch_size)
# print(f'{next(iter(batch))=}')

# - Prepare functions to tokenize batch
def preprocess(examples):  # tokenize the raw text column (named "link" in this dataset's table)
    return tokenizer(examples["link"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):  # apply preprocess to every example in the batch, where the batch is a (streaming) dataset
    return batch.map(preprocess, batched=True, remove_columns=remove_columns)
# tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)  # equivalent direct call
tokenized_batch = map(batch)
# print(f'{next(iter(tokenized_batch))=}')

from torch.utils.data import Dataset, DataLoader, SequentialSampler
dataset = tokenized_batch
print(f'{type(dataset)=}')
print(f'{dataset.__class__=}')
print(f'{isinstance(dataset, Dataset)=}')
# for i, d in enumerate(dataset):
#     assert isinstance(d, dict)
#     # dd = dataset[i]
#     # assert isinstance(dd, dict)
loader_opts = {}
classifier_opts = {}
# data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
#                         num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=SequentialSampler(range(512))  )
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
                    num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
print(f'{iter(data_loader)=}')
print(f'{next(iter(data_loader))=}')
print('Done\a')

The error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    126         try:
--> 127             return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
    128         except TypeError:

9 frames
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    148                 return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
    149 
--> 150     raise TypeError(default_collate_err_msg_format.format(elem_type))
    151 
    152 

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

Why? And why don't the single datasets C4 and WikiText give this error, only the interleaved dataset?

Ideally, I don't want to have to write my own collate function.

  • Colab: https://colab.research.google.com/drive/1sbs95as_66mtK9VK_vbaE9gLE-Tjof1-?usp=sharing
  • Related: How does one create a PyTorch DataLoader with a custom Hugging Face dataset without it erroring? https://stackoverflow.com/questions/76872115/how-does-one-create-a-pytorch-data-loader-with-a-custom-hugging-face-data-set-wi
  • HF discussion: https://discuss.huggingface.co/t/how-does-one-create-a-pytoch-data-loader-using-an-interleaved-hugging-face-dataset/50320

For some reason, when datasets are interleaved the collate function gets confused: because of the extra columns, it doesn't know how to merge rows whose schemas don't match. The way I fixed it was to keep only the columns I actually want:

    # -- Get data set
    # remove_columns = ['text', 'timestamp', 'url']
    keep_col = ['text']
    # keep only the columns of dataset.column_names that intersect with the keep_col list (one-liner below)
    print('-- interleaving datasets')
    datasets = [load_dataset(path, name, streaming=True, split="train").with_format("torch") for path, name in zip(path, name)]
    [print(f'{dataset.description=}') for dataset in datasets]
    dataset = interleave_datasets(datasets, probabilities)
    remove_columns = [col for col in dataset.column_names if col not in keep_col]
    print(f'{dataset=}')
    batch = dataset.take(batch_size)
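
To see why the extra columns trip up the default collate, here is a minimal repro of my own (an assumption consistent with the traceback above: interleaving streams with mismatched schemas fills the missing columns with None, and default_collate has no rule for NoneType):

    from torch.utils.data.dataloader import default_collate

    # WikiText rows have no 'url' column, so after interleaving with C4 the
    # stream (apparently) fills that key with None for WikiText examples.
    rows = [
        {"text": "from wikitext", "url": None},
        {"text": "from c4", "url": "https://example.com"},
    ]
    default_collate(rows)
    # TypeError: default_collate: batch must contain tensors, numpy arrays,
    # numbers, dicts or lists; found <class 'NoneType'>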

And if you already know which text field you need (say "text", since it is so common), you can tokenize on the fly in a custom collate function:

    def collate_tokenize(data):
        print(f'{data[0]=}')
        text_batch = [element["text"] for element in data]
        tokenized = tokenizer(text_batch, padding='longest', truncation=True, return_tensors='pt')
        return tokenized
    data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False, collate_fn=collate_tokenize)
    # data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False)
    # num_batches = len(list(data_loader))
    batch = next(iter(data_loader))
    print(f'{batch=}')
    print('Done!\a')
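
A side benefit of tokenizing inside the collate function is that the separate .map pass becomes unnecessary. A sketch of my own, assuming the raw interleaved `dataset` from the snippet above:

    # Tokenize lazily inside the loader: collate_tokenize only reads
    # element["text"], so the None-filled extra columns are never collated.
    data_loader = DataLoader(dataset, shuffle=False, batch_size=8,
                             num_workers=0, drop_last=False, collate_fn=collate_tokenize)
    batch = next(iter(data_loader))
    print(f'{batch["input_ids"].shape=}')  # e.g. torch.Size([8, <longest seq in batch>])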

Full code:

def test_interleaved_data_set_2_data_loader():
    """ https://colab.research.google.com/drive/1QWDhA6Q64qijXYnwIGn63Aq9Eg5qt8tQ#scrollTo=Wjyy6QYimvIm """
    remove_columns = []
    # -- Get probe network
    from datasets import load_dataset
    import torch
    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
    device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
    probe_network = probe_network.to(device)

    from datasets import interleave_datasets

    path, name = ['c4', 'wikitext'], ['en', 'wikitext-103-v1']
    probabilities = [1.0/len(path)] * len(path)
    batch_size = 512

    # -- Get data set
    # remove_columns = ['text', 'timestamp', 'url']
    keep_col = ['text']
    # keep only the columns of dataset.column_names that intersect with the keep_col list (one-liner below)
    print('-- interleaving datasets')
    datasets = [load_dataset(path, name, streaming=True, split="train").with_format("torch") for path, name in zip(path, name)]
    [print(f'{dataset.description=}') for dataset in datasets]
    dataset = interleave_datasets(datasets, probabilities)
    remove_columns = [col for col in dataset.column_names if col not in keep_col]
    print(f'{dataset=}')
    batch = dataset.take(batch_size)

    # - Prepare functions to tokenize batch
    def preprocess(examples):
        return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    def map(batch):
        return batch.map(preprocess, batched=True, remove_columns=remove_columns)
    # tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)
    tokenized_batch = map(batch)
    print(f'{next(iter(tokenized_batch))=}')

    # -- Get data loader
    from torch.utils.data import DataLoader, Dataset

    # def collate_tokenize(data):
    #     print(f'{data[0]=}')
    #     text_batch = [element["text"] for element in data]
    #     tokenized = tokenizer(text_batch, padding='longest', truncation=True, return_tensors='pt')
    #     return tokenized
    # data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False, collate_fn=collate_tokenize)
    data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False)
    # num_batches = len(list(data_loader))
    batch = next(iter(data_loader))
    print(f'{batch=}')
    print('Done!\a')
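
To sanity-check the whole pipeline, a collated batch can be fed straight through the probe network. A minimal sketch of my own (reusing input_ids as labels for the LM loss is my addition, not part of the original test):

    # One forward pass of GPT-2 on a collated batch; since .map padded every
    # example to max_length=128, default_collate yields rectangular tensors.
    batch = next(iter(data_loader))
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    outputs = probe_network(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    print(f'{outputs.loss=}')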