当前位置:网站首页>二、OCR训练时,将txt文件和图片数据转为lmdb文件格式
二、OCR训练时,将txt文件和图片数据转为lmdb文件格式
2022-07-29 05:22:00 【MY头发乱了】
前言
随着人工智能的不断发展,机器学习这门技术也越来越重要,本文就介绍OCR训练前lmdb文件制作的基础内容。提示:以下是本篇文章正文内容,下面案例可供参考
一、背景?
示例:是基于ocr训练而作。
二、直接上内容
1.代码
代码如下(示例):
# -*- coding:utf-8 -*-
import os
import lmdb  # third-party module; install it first with: pip install lmdb
import cv2
import glob  # used to collect the image paths that match a wildcard pattern
import numpy as np
def checkImageIsValid(imageBin):
    """Return True when ``imageBin`` decodes to a non-empty image.

    Args:
        imageBin: Raw encoded image bytes (the content of an image file),
            or None.

    Returns:
        False when ``imageBin`` is None or empty, when ``cv2.imdecode``
        returns None for it, or when the decoded image has zero height or
        width; True otherwise.
    """
    # An empty buffer can never hold an image; reject it before decoding.
    if not imageBin:
        return False
    # np.frombuffer is the supported replacement for the deprecated binary
    # mode of np.fromstring.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
def writeCache(env, cache):
    """Write every entry of ``cache`` to ``env`` in one write transaction.

    Args:
        env: An open LMDB environment; ``env.begin(write=True)`` must return
            a transaction usable as a context manager with a ``put`` method.
        cache: Mapping of keys to values. ``bytes`` keys and values are
            stored unchanged; anything else is stored as the encoded form of
            its ``str()`` representation.
    """
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            key = k if isinstance(k, bytes) else str(k).encode()
            # Image data is already bytes. Passing it through str() would
            # store the text of its repr ("b'...'") instead of the image.
            value = v if isinstance(v, bytes) else str(v).encode()
            txn.put(key, value)


def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """Create an LMDB dataset for CRNN training.

    Each accepted sample ``n`` (1-based, counting only accepted samples) is
    stored under the keys ``image-%09d`` and ``label-%09d`` (and
    ``lexicon-%09d`` when ``lexiconList`` is given). The number of stored
    samples is written under ``num-samples``.

    Args:
        outputPath: LMDB output path.
        imagePathList: List of image paths.
        labelList: List of the corresponding ground-truth texts; must have
            the same length as ``imagePathList``.
        lexiconList: Optional list of lexicon lists, indexed like
            ``imagePathList``; each lexicon is stored space-joined.
        checkValid: If true, skip every image that ``checkImageIsValid``
            rejects.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    assert len(imagePathList) == len(labelList)
    nSamples = len(imagePathList)
    print('...................')
    # map_size is 8 GiB (8589934592 bytes); use a larger value such as
    # 1099511627776 (1 TiB) if the dataset does not fit.
    env = lmdb.open(outputPath, map_size=8589934592)
    try:
        cache = {}
        cnt = 1
        for i, (imagePath, label) in enumerate(zip(imagePathList, labelList)):
            if not os.path.exists(imagePath):
                print('%s does not exist' % imagePath)
                continue
            # Images are binary data: text mode would try to decode them
            # (and translate line endings), corrupting or rejecting the file.
            with open(imagePath, 'rb') as f:
                imageBin = f.read()
            if checkValid and not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue
            cache['image-%09d' % cnt] = imageBin
            cache['label-%09d' % cnt] = label
            if lexiconList:
                cache['lexicon-%09d' % cnt] = ' '.join(lexiconList[i])
            # Flush in batches of 1000 samples to bound memory use.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
                print('Written %d / %d' % (cnt, nSamples))
            cnt += 1
        nSamples = cnt - 1
        cache['num-samples'] = str(nSamples)
        writeCache(env, cache)
    finally:
        # Release the environment even when a sample raises.
        env.close()
    print('Created dataset with %d samples' % nSamples)
def read_text(path):
    """Return the content of the text file at ``path``, stripped of leading
    and trailing whitespace.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(path) as handle:
        return handle.read().strip()
if __name__ == '__main__':
    # Output directory of the LMDB database.
    outputPath = r'E:\enducate\test_paper\Train_code\train'
    # Wildcard pattern for the images. Each image's label is read from the
    # .txt file that has the same base name and sits in the same folder.
    path = r"E:\enducate\test_paper\Train_code\data22222\*.png"
    imagePathList = glob.glob(path)
    print('------------', len(imagePathList), '------------')
    imgLabelLists = []
    for p in imagePathList:
        # Derive the label path from the image's base name so it works for
        # any image extension. A literal '.jpg' -> '.txt' replacement leaves
        # '.png' paths unchanged and would read the image itself as the label.
        labelPath = os.path.splitext(p)[0] + '.txt'
        try:
            imgLabelLists.append((p, read_text(labelPath)))
        except (OSError, UnicodeDecodeError) as err:
            # Skip samples whose label is missing or unreadable, but say so.
            print('skip %s: %s' % (p, err))
            continue
    # Sort the samples by label length.
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [p[0] for p in imgLabelList]
    txtLists = [p[1] for p in imgLabelList]
    createDataset(outputPath, imgPaths, txtLists, lexiconList=None, checkValid=True)
2.文件说明
第一种文件格式
图片路径和txt标签文件共存。(1张图片对应1个txt标签文件)
txt文件和图片是放在一个文件夹的。
第二种文件格式如下:
文件由多张图片和1个txt文件组成,其中txt文件包括所有图片的标签。
格式为:图片路径名+\t+标签。
// A code block
var foo = 'bar';
// An highlighted block
""" a modified version of CRNN torch repository https://github.com/bgshih/crnn/blob/master/tool/create_dataset.py """
import fire
import os
import lmdb
import cv2
import numpy as np
def checkImageIsValid(imageBin):
    """Return True when ``imageBin`` decodes to a non-empty image.

    Args:
        imageBin: Raw encoded image bytes (the content of an image file),
            or None.

    Returns:
        False when ``imageBin`` is None or empty, when ``cv2.imdecode``
        returns None for it, or when the decoded image has zero height or
        width; True otherwise.
    """
    # An empty buffer can never hold an image; reject it before decoding.
    if not imageBin:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    # Without this check an undecodable buffer fails on ``img.shape`` with an
    # AttributeError instead of being reported as invalid.
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
def writeCache(env, cache):
    """Store all entries of ``cache`` in ``env`` within one write transaction.

    Keys and values are handed to the transaction's ``put`` unchanged.
    """
    with env.begin(write=True) as transaction:
        for key in cache:
            transaction.put(key, cache[key])
def createDataset(inputPath, gtFile, outputPath, checkValid=True):
    """Create an LMDB dataset for training and evaluation.

    Each accepted sample ``n`` (1-based, counting only accepted samples) is
    stored under the byte keys ``image-%09d`` and ``label-%09d``; the number
    of stored samples is written under ``num-samples``.

    Args:
        inputPath: Input folder. An image path from ``gtFile`` that does not
            exist as written is looked up relative to this folder.
        gtFile: UTF-8 text file with one sample per line, formatted as
            ``<image path>\\t<label>``. Blank lines are ignored and lines
            without a tab are reported and skipped.
        outputPath: LMDB output path; created if it does not exist.
        checkValid: If true, skip every image that ``checkImageIsValid``
            rejects.
    """
    os.makedirs(outputPath, exist_ok=True)
    env = lmdb.open(outputPath, map_size=1099511627776)
    try:
        cache = {}
        cnt = 1
        with open(gtFile, 'r', encoding='utf-8') as data:
            datalist = data.readlines()
        nSamples = len(datalist)
        for i, line in enumerate(datalist):
            line = line.strip('\n')
            # A trailing blank line must not abort the whole conversion.
            if not line.strip():
                continue
            # Split on the first tab only, so a label may itself contain tabs.
            imagePath, sep, label = line.partition('\t')
            if not sep:
                print('line %d has no tab-separated label, skipped' % i)
                continue

            if not os.path.exists(imagePath):
                # Fall back to resolving the path against inputPath. isfile
                # (not exists) so an empty path cannot match the folder itself.
                candidate = os.path.join(str(inputPath), imagePath)
                if not os.path.isfile(candidate):
                    print('%s does not exist' % imagePath)
                    continue
                imagePath = candidate
            with open(imagePath, 'rb') as f:
                imageBin = f.read()
            if checkValid:
                try:
                    if not checkImageIsValid(imageBin):
                        print('%s is not a valid image' % imagePath)
                        continue
                except Exception:
                    # Record the failing line index and keep going; Exception
                    # (not a bare except) lets Ctrl-C still stop the run.
                    print('error occured', i)
                    with open(os.path.join(outputPath, 'error_image_log.txt'), 'a') as log:
                        log.write('%s-th image data occured error\n' % str(i))
                    continue

            cache[b'image-%09d' % cnt] = imageBin
            cache[b'label-%09d' % cnt] = label.encode()

            # Flush in batches of 1000 samples to bound memory use.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
                print('Written %d / %d' % (cnt, nSamples))
            cnt += 1
        nSamples = cnt - 1
        cache[b'num-samples'] = str(nSamples).encode()
        writeCache(env, cache)
    finally:
        # Release the environment even when a sample raises.
        env.close()
    print('Created dataset with %d samples' % nSamples)
if __name__ == '__main__':
    # Expose createDataset's parameters as command-line flags via python-fire.
    fire.Fire(createDataset)
# Example invocation:
# python create_lmdb_dataset.py --inputPath /data2/ --gtFile /data2/meterdataset/digital_dataset/otherdataset/1030_data/collect_val.txt --outputPath /data2/meterdataset/digital_dataset/otherdataset/1030_data/2021-0507-result/val
第二个运行说明:
> 输入路径:inputPath 操作的文件夹
txt文件: gtFile
lmdb输出的文件路径: outputPath
// An highlighted block
python create_lmdb_dataset.py --inputPath /data2/ --gtFile /data2/enducation/paper_recog_total/train-paper-recog/Recognization/deep-text-recognition-SHENG/data/text_recog/txt4val/img_gt/gt.txt --outputPath /data2/enducation/paper_recog_total/train-paper-recog/Recognization/deep-text-recognition-SHENG/data/val
边栏推荐
- 【语义分割】Fully Attentional Network for Semantic Segmentation
- 迁移学习—— Transfer Feature Learning with Joint Distribution Adaptation
- MySql统计函数COUNT详解
- 【目标检测】KL-Loss:Bounding Box Regression with Uncertainty for Accurate Object Detection
- Spring, summer, autumn and winter with Miss Zhang (5)
- Ribbon learning notes II
- pip安装后仍有解决ImportError: No module named XX
- Analysis on the principle of flow
- 在uni-app项目中,如何实现微信小程序openid的获取
- 【Transformer】SegFormer:Simple and Efficient Design for Semantic Segmentation with Transformers
猜你喜欢
PHP write a diaper to buy the lowest price in the whole network
FFmpeg创作GIF表情包教程来了!赶紧说声多谢乌蝇哥?
fastText学习——文本分类
[ml] PMML of machine learning model -- Overview
【ML】机器学习模型之PMML--概述
The third week of postgraduate freshman training: resnet+resnext
PyTorch基础知识(可入门)
研究生新生培训第三周:ResNet+ResNeXt
ABSA1: Attentional Encoder Network for Targeted Sentiment Classification
【Attention】Visual Attention Network
随机推荐
[convolution kernel design] scaling up your kernels to 31x31: revising large kernel design in CNN
[clustmaps] visitor statistics
【网络设计】ConvNeXt:A ConvNet for the 2020s
Research on the implementation principle of reentrantlock in concurrent programming learning notes
电脑视频暂停再继续,声音突然变大
Valuable blog and personal experience collection (continuous update)
ASM piling: after learning ASM tree API, you don't have to be afraid of hook anymore
【语义分割】SETR_Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformer
tensorboard使用
Reporting service 2016 custom authentication
虚假新闻检测论文阅读(四):A novel self-learning semi-supervised deep learning network to detect fake news on...
Detailed explanation of tool classes countdownlatch and cyclicbarrier of concurrent programming learning notes
Briefly talk about the difference between pendingintent and intent
[overview] image classification network
Technology that deeply understands the principle of MMAP and makes big manufacturers love it
并发编程学习笔记 之 ReentrantLock实现原理的探究
Spring, summer, autumn and winter with Miss Zhang (3)
【目标检测】KL-Loss:Bounding Box Regression with Uncertainty for Accurate Object Detection
fastText学习——文本分类
第三周周报 ResNet+ResNext