问题描述
在使用由自定义数据集制作的 Pytorch 数据加载器进行神经网络训练期间,我遇到了奇怪的行为。数据加载器设置为 num_workers=4、pin_memory=False。
大多数时候,训练都顺利完成。
有时,训练会随机停止,并出现以下错误:
- OSError: [Errno 9] Bad file descriptor
- EOFError
看起来错误是在创建套接字以访问数据加载器元素期间发生的。
当我将 num_workers 设置为 0 时,错误消失,但我需要多进程加载数据来加速训练。
错误的根源可能是什么?谢谢 !
python 和库的版本
Python 3.9.12、Pytorch 1.11.0+cu102
编辑:错误仅发生在集群上
错误文件的输出
Traceback (most recent call last):
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/resource_sharer.py", line 145, in _serve
Epoch 17: 52%|█████▏ | 253/486 [01:00<00:55, 4.18it/s, loss=1.73]
Traceback (most recent call last):
File "/my_directory/bench/run_experiments.py", line 251, in <module>
send(conn, destination_pid)
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/resource_sharer.py", line 50, in send
reduction.send_handle(conn, new_fd, pid)
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/reduction.py", line 183, in send_handle
with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
File "/my_directory/.conda/envs/geoseg/lib/python3.9/socket.py", line 545, in fromfd
return socket(family, type, proto, nfd)
File "/my_directory/.conda/envs/geoseg/lib/python3.9/socket.py", line 232, in __init__
_socket.socket.__init__(self, family, type, proto, fileno)
OSError: [Errno 9] Bad file descriptor
main(args)
File "/my_directory/bench/run_experiments.py", line 183, in main
run_experiments(args, save_path)
File "/my_directory/bench/run_experiments.py", line 70, in run_experiments
) = run_algorithm(algorithm_params[j], mp[j], ss, dataset)
File "/my_directorybench/algorithms.py", line 38, in run_algorithm
data = es(mp,search_space, dataset, **ps)
File "/my_directorybench/algorithms.py", line 151, in es
data = ss.generate_random_dataset(mp,
File "/my_directorybench/architectures.py", line 241, in generate_random_dataset
arch_dict = self.query_arch(
File "/my_directory/bench/architectures.py", line 71, in query_arch
train_losses, val_losses, model = meta_net.get_val_loss(
File "/my_directory/bench/meta_neural_net.py", line 50, in get_val_loss
return self.training(
File "/my_directorybench/meta_neural_net.py", line 155, in training
train_loss = self.train_step(model, device, train_loader, epoch)
File "/my_directory/bench/meta_neural_net.py", line 179, in train_step
for batch_idx, mini_batch in enumerate(pbar):
File "/my_directory/.conda/envs/geoseg/lib/python3.9/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/my_directory/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 530, in __next__
data = self._next_data()
File "/my_directory/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1207, in _next_data
idx, data = self._get_data()
File "/my_directory/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1173, in _get_data
success, data = self._try_get_data()
File "/my_directory/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1011, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/queues.py", line 122, in get
return _ForkingPickler.loads(res)
File "/my_directory/.local/lib/python3.9/site-packages/torch/multiprocessing/reductions.py", line 295, in rebuild_storage_fd
fd = df.detach()
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/reduction.py", line 189, in recv_handle
return recvfds(s, 1)[0]
File "/my_directory/.conda/envs/geoseg/lib/python3.9/multiprocessing/reduction.py", line 159, in recvfds
raise EOFError
EOFError
编辑:访问数据的方式
from PIL import Image
from torch.utils.data import DataLoader
# extract of code of dataset
class Dataset():
    """Map-style dataset pairing input images with segmentation masks.

    Parameters
    ----------
    image_files : sequence of paths to the input images (decoded as RGB).
    mask_files : sequence of paths to the mask images (decoded as
        single-channel ``L``), aligned index-for-index with ``image_files``.
    """

    def __init__(self, image_files, mask_files):
        self.image_files = image_files
        self.mask_files = mask_files

    def __len__(self):
        # Required for a map-style dataset: DataLoader's sampler
        # (e.g. with shuffle=True) needs the dataset size.
        return len(self.image_files)

    def __getitem__(self, idx):
        # Images are decoded lazily, inside the worker process that
        # fetches this index; modes are forced so every sample has a
        # consistent channel layout.
        img = Image.open(self.image_files[idx]).convert('RGB')
        mask = Image.open(self.mask_files[idx]).convert('L')
        return img, mask
# extract of code of trainloader
# Training loader: 4 worker subprocesses each fetch shuffled batches of 4;
# the incomplete trailing batch is dropped, and workers are torn down
# after every epoch (persistent_workers=False).
loader_kwargs = dict(
    batch_size=4,
    num_workers=4,
    pin_memory=False,
    shuffle=True,
    drop_last=True,
    persistent_workers=False,
)
train_loader = DataLoader(dataset=train_dataset, **loader_kwargs)