II. Converting txt files and image data to the LMDB format for OCR training
2022-07-29 05:22:00 【MY头发乱了】
Preface
As artificial intelligence keeps developing, machine learning is becoming more and more important. This article introduces the basics of preparing LMDB files before OCR training. Note: the following is the main content of this article, and the examples below are for reference.
1. Background
Example: this write-up is based on an OCR training workflow.
2. Straight to the content
1. Code
The code is as follows (example):
# -*- coding:utf-8 -*-
import os
import lmdb  # install via: pip install lmdb
import cv2
import glob  # used to collect image paths matching a pattern
import numpy as np


def checkImageIsValid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)  # np.fromstring is deprecated
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def writeCache(env, cache):
    with env.begin(write=True) as txn:
        # for k, v in cache.iteritems():  # python2
        for k, v in cache.items():  # python3
            # image values are already bytes; labels are str and must be encoded
            txn.put(k.encode(), v if isinstance(v, bytes) else str(v).encode())


def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create LMDB dataset for CRNN training.
    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image paths
        labelList     : list of corresponding ground-truth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if True, check the validity of every image
    """
    # print(len(imagePathList), len(labelList))
    assert (len(imagePathList) == len(labelList))
    nSamples = len(imagePathList)
    print('...................')
    # 1099511627776 bytes would be 1 TB; 8589934592 is 8 GB.
    # Adjust map_size to the size of your dataset.
    env = lmdb.open(outputPath, map_size=8589934592)
    cache = {}
    cnt = 1
    for i in range(nSamples):
        imagePath = imagePathList[i]
        label = labelList[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        # Read in binary mode ('rb'); reading image bytes in text mode
        # breaks the validity check (especially on Linux).
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            if not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue
        imageKey = 'image-%09d' % cnt
        labelKey = 'label-%09d' % cnt
        cache[imageKey] = imageBin
        cache[labelKey] = label
        if lexiconList:
            lexiconKey = 'lexicon-%09d' % cnt
            cache[lexiconKey] = ' '.join(lexiconList[i])
        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt - 1
    cache['num-samples'] = str(nSamples)
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)


def read_text(path):
    with open(path) as f:
        text = f.read()
    text = text.strip()
    return text


if __name__ == '__main__':
    # LMDB output directory
    outputPath = r'E:\enducate\test_paper\Train_code\train'
    # folder holding the images and their per-image txt label files
    path = r"E:\enducate\test_paper\Train_code\data22222\*.png"
    imagePathList = glob.glob(path)
    print('------------', len(imagePathList), '------------')
    imgLabelLists = []
    for p in imagePathList:
        try:
            # the glob pattern collects .png files, so replace the .png
            # extension (not .jpg) to find the matching label file
            imgLabelLists.append((p, read_text(p.replace('.png', '.txt'))))
        except Exception:
            continue
    # imgLabelList = [(p, read_text(p.replace('.png', '.txt'))) for p in imagePathList]
    # sort by label length
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [p[0] for p in imgLabelList]
    txtLists = [p[1] for p in imgLabelList]
    createDataset(outputPath, imgPaths, txtLists, lexiconList=None, checkValid=True)
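To sanity-check the result, here is a minimal reader sketch (assuming the 'image-%09d' / 'label-%09d' / 'num-samples' key scheme written above; replace the path with your own output directory):

import lmdb

env = lmdb.open(r'E:\enducate\test_paper\Train_code\train', readonly=True, lock=False)
with env.begin() as txn:
    # total number of samples stored by createDataset
    nSamples = int(txn.get('num-samples'.encode()))
    print('num-samples:', nSamples)
    # fetch the first sample back out of the database
    label = txn.get('label-%09d'.encode() % 1).decode()
    imageBin = txn.get('image-%09d'.encode() % 1)
    print('label:', label, '| image bytes:', len(imageBin))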
2. File layouts
First file layout
Image files and txt label files coexist (one image corresponds to one txt label file); a sketch is shown below.
The txt files and the images sit in the same folder.
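As a hypothetical illustration (the file names are made up), a folder in this layout might look like the following, where each .txt holds the label string for the image of the same name:

data22222/
    0001.png
    0001.txt
    0002.png
    0002.txt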
The second file layout is as follows:
The data consists of multiple images plus a single txt file, where that txt file holds the labels for all of the images.
The format is: image path + \t + label.
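As a hypothetical illustration (the paths and labels are made up), a gt.txt in this layout holds one sample per line, with the image path and the label separated by a tab:

/data2/text_recog/img/word_001.png	你好
/data2/text_recog/img/word_002.png	world123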
""" a modified version of CRNN torch repository https://github.com/bgshih/crnn/blob/master/tool/create_dataset.py """
import fire
import os
import lmdb
import cv2
import numpy as np
def checkImageIsValid(imageBin):
if imageBin is None:
return False
imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
imgH, imgW = img.shape[0], img.shape[1]
if imgH * imgW == 0:
return False
return True
def writeCache(env, cache):
with env.begin(write=True) as txn:
for k, v in cache.items():
txn.put(k, v)
def createDataset(inputPath, gtFile, outputPath, checkValid=True):
"""
Create LMDB dataset for training and evaluation.
ARGS:
inputPath : input folder path where starts imagePath
outputPath : LMDB output path
gtFile : list of image path and label
checkValid : if true, check the validity of every image
"""
os.makedirs(outputPath, exist_ok=True)
env = lmdb.open(outputPath, map_size=1099511627776)
cache = {
}
cnt = 1
with open(gtFile, 'r', encoding='utf-8') as data:
datalist = data.readlines()
nSamples = len(datalist)
for i in range(nSamples):
imagePath, label = datalist[i].strip('\n').split('\t')
# imagePath = os.path.join(inputPath, imagePath)
# # only use alphanumeric data
# if re.search('[^a-zA-Z0-9]', label):
# continue
if not os.path.exists(imagePath):
print('%s does not exist' % imagePath)
continue
with open(imagePath, 'rb') as f:
imageBin = f.read()
if checkValid:
try:
if not checkImageIsValid(imageBin):
print('%s is not a valid image' % imagePath)
continue
except:
print('error occured', i)
with open(outputPath + '/error_image_log.txt', 'a') as log:
log.write('%s-th image data occured error\n' % str(i))
continue
imageKey = 'image-%09d'.encode() % cnt
labelKey = 'label-%09d'.encode() % cnt
cache[imageKey] = imageBin
cache[labelKey] = label.encode()
if cnt % 1000 == 0:
writeCache(env, cache)
cache = {
}
print('Written %d / %d' % (cnt, nSamples))
cnt += 1
nSamples = cnt-1
cache['num-samples'.encode()] = str(nSamples).encode()
writeCache(env, cache)
print('Created dataset with %d samples' % nSamples)
if __name__ == '__main__':
fire.Fire(createDataset)
# python create_lmdb_dataset.py --inputPath /data2/ --gtFile /data2/meterdataset/digital_dataset/otherdataset/1030_data/collect_val.txt --outputPath /data2/meterdataset/digital_dataset/otherdataset/1030_data/2021-0507-result/val
Run notes for the second script:
> inputPath: the folder the script operates on (the root that image paths would start from)
> gtFile: the txt label file
> outputPath: the directory where the LMDB output is written

Note that the os.path.join(inputPath, imagePath) line is commented out in the code above, so inputPath is effectively unused and the image paths in gtFile must resolve as written (absolute, or relative to the working directory).
python create_lmdb_dataset.py --inputPath /data2/ --gtFile /data2/enducation/paper_recog_total/train-paper-recog/Recognization/deep-text-recognition-SHENG/data/text_recog/txt4val/img_gt/gt.txt --outputPath /data2/enducation/paper_recog_total/train-paper-recog/Recognization/deep-text-recognition-SHENG/data/val
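If your data is stored in the first layout (one txt per image) but you want to use the second script, a small helper sketch (the paths here are hypothetical) can flatten the per-image labels into a single gt.txt:

import glob
import os

img_dir = r'/data2/text_recog/img'     # hypothetical image folder
gt_path = r'/data2/text_recog/gt.txt'  # hypothetical output label file

with open(gt_path, 'w', encoding='utf-8') as gt:
    for img in sorted(glob.glob(os.path.join(img_dir, '*.png'))):
        txt = os.path.splitext(img)[0] + '.txt'
        if not os.path.exists(txt):
            continue
        with open(txt, encoding='utf-8') as f:
            label = f.read().strip()
        # one sample per line: image path, tab, label
        gt.write('%s\t%s\n' % (img, label))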