With tensorflow 2.x.x
您可以使用数据集 API `tf.data.Dataset.from_generator`
从生成器函数创建数据集。该生成器函数将通过 numpy memmap 完成读取工作。
下面的代码创建一个虚拟数据文件,然后从磁盘上的文件中一次读取一个示例。它可以轻松更新以读取多个示例以增加 IO 吞吐量(如果您需要在下面的代码示例中实现这一点,请告诉我)。
# imports
import numpy as np
import pathlib
import tensorflow as tf
# create huge numpy array and save it to disk
file = pathlib.Path("huge_data.npy")
examples = 5000                       # number of examples in the dataset
example_shape = (256, 256)            # shape of one example
huge_data_shape = (examples, *example_shape)
huge_data_dtype = np.float64          # NOTE: 5000 * 256 * 256 * 8 bytes ≈ 2.6 GB on disk
# create file if does not exist
if not file.is_file():
    print("creating file with random data and saving to disk")
    numpy_data = np.random.rand(*huge_data_shape).astype(huge_data_dtype)
    np.save(file, numpy_data)
# memmap the file: mmap_mode='r' maps the array read-only from disk,
# so the full array is never loaded into RAM at once
numpy_data_memmap = np.load(file, mmap_mode='r')
# generator function handed to tf.data: yields one example (one 2-D slice
# of the memmapped array) at a time, keeping reads lazy
def data_generator():
    """Lazily yield examples from the on-disk memmap, one slice per step."""
    yield from numpy_data_memmap
# build a tf.data pipeline that pulls examples from the generator fn
# NOTE(review): output_types/output_shapes are deprecated in newer TF 2.x
# releases in favor of output_signature=tf.TensorSpec(...) — consider
# switching if the minimum supported TF version allows it
dataset = tf.data.Dataset.from_generator(
    data_generator,
    huge_data_dtype,
    example_shape,
)

# consume the huge dataset one example at a time
for index, example in enumerate(dataset):
    print(index, example.shape, example.dtype)
Output:
0 (256, 256) <dtype: 'float64'>
1 (256, 256) <dtype: 'float64'>
2 (256, 256) <dtype: 'float64'>
3 (256, 256) <dtype: 'float64'>
...
4995 (256, 256) <dtype: 'float64'>
4996 (256, 256) <dtype: 'float64'>
4997 (256, 256) <dtype: 'float64'>
4998 (256, 256) <dtype: 'float64'>
4999 (256, 256) <dtype: 'float64'>