You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

44 lines
1.5 KiB

import os
import paddle as paddle
from multiprocessing import cpu_count
import numpy as np
from PIL import Image
# Build the train/test list files from the class folders under data_path.
# Every output line has the form "<image path>\t<folder name>\n", where the
# folder name serves as the label.
data_path = './data_set'
train_data = './train_data.list'
test_data = './test_data.list'

# Keep only sub-directories: a stray file under data_path (e.g. .DS_Store)
# would make the per-folder os.listdir() below raise NotADirectoryError.
# os.listdir() returns entries in arbitrary order, so sort for a
# reproducible result.
characters_folders = [
    entry for entry in sorted(os.listdir(data_path))
    if os.path.isdir(os.path.join(data_path, entry))
]

# Opening once in 'w' mode truncates any list left over from an earlier run,
# so no separate exists()/remove() step or per-folder reopen is needed.
with open(train_data, 'w') as f_train, open(test_data, 'w') as f_test:
    for characters_folder in characters_folders:
        character_imgs = sorted(
            os.listdir(os.path.join(data_path, characters_folder))
        )
        for count, img in enumerate(character_imgs):
            # Paths are joined with "/" exactly as before so the list file
            # content keeps the same format.
            filePath = data_path + "/" + characters_folder + "/" + img
            record = filePath + "\t" + characters_folder + "\n"
            # Images at index 0, 10, 20, ... of each folder go to the test
            # list; all others go to the training list.
            if count % 10 == 0:
                f_test.write(record)
            else:
                f_train.write(record)
def data_mapper(sample):
    """Load one image and convert it to a scaled float32 array.

    Args:
        sample: An ``(image_path, label)`` pair.

    Returns:
        A tuple ``(pixels, label)``. ``pixels`` is the image resized to
        100x100, cast to float32, transposed with axes ``(2, 0, 1)`` and
        divided by 255.0. ``label`` is passed through unchanged.
    """
    img_path, label = sample
    # The context manager closes the underlying file once the resized copy
    # has been produced; Image.open() alone leaves the handle open.
    with Image.open(img_path) as opened:
        # Single-band images (grayscale, palette, ...) become 2-D arrays,
        # which the 3-axis transpose below cannot handle, so expand them to
        # RGB. Multi-band images are left exactly as they were.
        if len(opened.getbands()) == 1:
            source = opened.convert('RGB')
        else:
            source = opened
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
        # filter under its current name.
        resized = source.resize((100, 100), Image.LANCZOS)
    pixels = np.array(resized).astype('float32')
    pixels = pixels.transpose((2, 0, 1))
    pixels = pixels / 255.0
    return pixels, label
def data_reader(data_list_path):
    """Create a reader over the samples listed in ``data_list_path``.

    Each line of the list file is expected to hold two tab-separated
    fields: an image path and an integer label.

    Args:
        data_list_path: Path of the list file to read.

    Returns:
        The result of ``paddle.reader.xmap_readers`` applied to
        ``data_mapper`` and the line reader, with ``cpu_count()`` and
        ``512`` as the third and fourth arguments.
    """
    def reader():
        # Yield one (image path, integer label) pair per line of the list.
        with open(data_list_path, 'r') as list_file:
            for record in list_file:
                image_path, raw_label = record.split('\t')
                yield image_path, int(raw_label)

    mapped = paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 512)
    return mapped