当前位置:网站首页>opencv+paddle ocr 识别图片提取表格信息
opencv+paddle ocr 识别图片提取表格信息
2022-07-28 06:30:00 【路新航】
思路:
1.提取出横线
2.提取出纵线
3.得到交叉点,合并重复的交叉点:排序后相邻坐标之差不超过10像素的点视为同一个交叉点,只保留其中最后一个(坐标值最大的)点,由此得到表格的行列坐标
4.对每个单元格使用paddle ocr提取文字
在原文代码基础上修改了2点
1.pytesseract识别准确率不高,使用paddle ocr代替 pytesseract
2.识别出的表格交叉点有些并非真实交叉点,通过判断该行像素点个数,丢掉错误横纵坐标
import cv2
import numpy as np
import pandas as pd
# import pytesseract
# import re
from paddleocr import PaddleOCR
# Path of the table image to process.
src = 'image.png'
# Flag 1 (cv2.IMREAD_COLOR) loads the file as a 3-channel BGR image.
raw = cv2.imread(src, 1)
# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of an opaque error inside cvtColor.
if raw is None:
    raise FileNotFoundError('cannot read image file: %s' % src)
# Grayscale version of the image.
gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
# Binarize the inverted grayscale image so that the result has white
# strokes/lines on a black background.
# adaptiveThreshold arguments, in order:
#   src:            the grayscale image (inverted here with ~)
#   maxValue:       gray value assigned to pixels that satisfy the condition
#   adaptiveMethod: ADAPTIVE_THRESH_MEAN_C or ADAPTIVE_THRESH_GAUSSIAN_C
#   thresholdType:  THRESH_BINARY or THRESH_BINARY_INV
#   blockSize:      size of the neighbourhood used per pixel, must be odd
#   C:              constant subtracted from the computed local threshold,
#                   may be negative
binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, -5)
# Debug aid: uncomment to display the binarized image.
# cv2.imshow("binary_picture", binary)
# cv2.waitKey()
def _cluster_ends(sorted_points, min_gap):
    """Return the last value of every run of points spaced at most min_gap apart.

    ``sorted_points`` must be a non-empty, ascending 1-D integer array.
    """
    # A new run starts wherever the step to the next point exceeds min_gap;
    # the point just before such a step is the last one of its run.
    breaks = np.flatnonzero(np.diff(sorted_points) > min_gap)
    return sorted_points[breaks].tolist() + [int(sorted_points[-1])]


def _count_pixels_near(mask, index, window, axis):
    """Count non-zero pixels within ``window`` rows (axis=0) or columns (axis=1) of ``index``.

    The band is clipped to the image, so coordinates close to a border are safe.
    """
    start = max(0, index - window)
    stop = min(mask.shape[axis], index + window + 1)
    band = mask[start:stop, :] if axis == 0 else mask[:, start:stop]
    return int(np.count_nonzero(band))


def recognize_bgkx(binary, show_debug=True, min_gap=10, window=4, min_pixels=5):
    """Find the row and column coordinates of a table's grid-line intersections.

    Horizontal and vertical lines are isolated with morphological opening
    (erode then dilate with a long, one-pixel-thick kernel); the pixels where
    both line images are set are the intersections. Nearby intersection
    coordinates are merged and coordinates supported by too few intersection
    pixels are discarded.

    Args:
        binary: 2-D binarized image with white lines on a black background.
        show_debug: when true (the default, matching the original behaviour),
            display the intersection image with cv2.imshow and block on
            cv2.waitKey until a key is pressed. Pass False on headless hosts.
        min_gap: coordinates whose sorted neighbours differ by at most this
            many pixels are treated as the same grid line; the last (largest)
            one of each group is kept. Depends on the image; should be smaller
            than the smallest cell height/width.
        window: number of rows/columns on each side of a candidate coordinate
            that are included when counting intersection pixels.
        min_pixels: a candidate with fewer intersection pixels than this in
            its window is dropped as a false intersection.

    Returns:
        ``(x_point_arr, y_point_arr)``: ascending lists of the x coordinates
        of the vertical grid lines and the y coordinates of the horizontal
        grid lines. Both lists are empty when no intersection is found.
    """
    rows, cols = binary.shape

    # Horizontal lines. The kernel width is derived from the image width; a
    # smaller scale gives a longer kernel and therefore fewer detected lines.
    # max(1, ...) keeps the kernel valid on very small images.
    scale = 30
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, cols // scale), 1))
    eroded = cv2.erode(binary, kernel, iterations=1)
    dilated_col = cv2.dilate(eroded, kernel, iterations=1)

    # Vertical lines, same approach with a tall one-pixel-wide kernel.
    scale = 20
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, rows // scale)))
    eroded = cv2.erode(binary, kernel, iterations=1)
    dilated_row = cv2.dilate(eroded, kernel, iterations=1)

    # Pixels present in both line images are the grid intersections.
    bitwise_and = cv2.bitwise_and(dilated_col, dilated_row)
    if show_debug:
        cv2.imshow("excel_bitwise_and", bitwise_and)
        cv2.waitKey()

    ys, xs = np.where(bitwise_and > 0)
    if len(xs) == 0:
        # No intersections at all: nothing to cluster.
        return [], []

    # Collapse each group of neighbouring coordinates to a single value.
    x_point_arr = _cluster_ends(np.sort(xs), min_gap)
    y_point_arr = _cluster_ends(np.sort(ys), min_gap)

    # Drop row coordinates that are not really on a grid line: too few
    # intersection pixels in the rows around them.
    kept_y = []
    for y in y_point_arr:
        y_dot_num = _count_pixels_near(bitwise_and, y, window, axis=0)
        if y_dot_num < min_pixels:
            print('纵坐标%s并不在框线上删除,该行只有%s个像素' % (y, y_dot_num))
        else:
            kept_y.append(y)
    y_point_arr = kept_y

    # Same check for the column coordinates.
    kept_x = []
    for x in x_point_arr:
        print('检测', x)
        x_dot_num = _count_pixels_near(bitwise_and, x, window, axis=1)
        if x_dot_num < min_pixels:
            print('横坐标坐标%s并不在框线上删除 ,该列只有%s个像素' % (x, x_dot_num))
        else:
            kept_x.append(x)
    x_point_arr = kept_x

    print('该表格有%s行%s列 ' % (len(y_point_arr) - 1, len(x_point_arr) - 1))
    return x_point_arr, y_point_arr
# Detect the table grid on the binarized image; the two returned lists are
# module-level values read by the cell-extraction step further down.
x_point_arr, y_point_arr = recognize_bgkx(binary)
# Original note: "step back" - presumably refers to the 3-pixel upward shift
# applied to the row coordinates before the cells are cut out; TODO confirm.
# OCR engine created with the angle classifier enabled and lang="ch".
ocr = PaddleOCR(use_angle_cls=True, lang="ch")
def _looks_like_ocr_line(item):
    """Return True if ``item`` has the shape ``[box, (text, score)]``."""
    return (
        isinstance(item, (list, tuple))
        and len(item) == 2
        and isinstance(item[1], (list, tuple))
        and len(item[1]) > 0
        and isinstance(item[1][0], str)
    )


def _ocr_result_to_text(result):
    """Concatenate the recognized strings of one PaddleOCR ``ocr()`` result.

    Handles the flat layout ``[[box, (text, score)], ...]`` that the original
    code assumed, and the per-image layout ``[[[box, (text, score)], ...]]``
    (whose single entry may be None when nothing was recognized).
    NOTE(review): other result formats are not handled - confirm against the
    installed PaddleOCR version.
    """
    if not result:
        return ''
    first = result[0]
    if first is None:
        return ''
    lines = result if _looks_like_ocr_line(first) else first
    return ''.join(line[1][0] for line in lines)


def recognize_text_by_loop(x_points=None, y_points=None, image=None, engine=None,
                           output_path='saomiaojian.xlsx'):
    """OCR every table cell and save the table to an Excel file.

    The image is cut into cells using consecutive pairs of grid coordinates.
    Each cell is written to ``cell_image_<row>_<col>.png`` in the current
    directory and passed to the OCR engine. The first table row becomes the
    column header of the resulting DataFrame.

    Args:
        x_points: x coordinates of the vertical grid lines; defaults to the
            module-level ``x_point_arr``.
        y_points: y coordinates of the horizontal grid lines; defaults to the
            module-level ``y_point_arr``.
        image: image the cells are cut from; defaults to the module-level
            ``gray``.
        engine: object with a PaddleOCR-style ``ocr(path, cls=True)`` method;
            defaults to the module-level ``ocr``.
        output_path: Excel file to write.

    Returns:
        The pandas DataFrame that was written to ``output_path``.
    """
    # Resolve defaults from the module globals. The globals are only read,
    # never assigned, so they are not shadowed by locals (the original
    # assignment to y_point_arr inside the function raised UnboundLocalError).
    xs = x_point_arr if x_points is None else x_points
    ys = y_point_arr if y_points is None else y_points
    image = gray if image is None else image
    engine = ocr if engine is None else engine

    # Shift every row boundary up by 3 pixels as the original code did, but
    # clamp at 0 so a boundary near the top edge cannot become a negative
    # index that wraps around to the bottom of the image.
    ys = [max(0, y - 3) for y in ys]

    # One list per row boundary; the last one stays empty and is sliced off
    # below, which keeps the printed structure identical to the original.
    data = [[] for _ in ys]
    for i, (top, bottom) in enumerate(zip(ys, ys[1:])):
        for j, (left, right) in enumerate(zip(xs, xs[1:])):
            # Row range first, column range second.
            cell = image[top:bottom, left:right]
            if cell.size == 0:
                # Degenerate cell: nothing to write or recognize.
                data[i].append('')
                continue
            img_path = "cell_image_"+str(i)+'_'+str(j)+".png"
            cv2.imwrite(img_path, cell)
            result = engine.ocr(img_path, cls=True)
            text1 = _ocr_result_to_text(result)
            print(text1)
            data[i].append(text1)
    print(data)

    if data:
        table_coordinate = pd.DataFrame(data[1:-1], columns=data[0])
    else:
        # No grid rows were detected: write an empty sheet.
        table_coordinate = pd.DataFrame()
    table_coordinate.to_excel(output_path, index=False)
    return table_coordinate


# The extraction step was defined but never invoked; run it when the file is
# executed as a script.
if __name__ == "__main__":
    recognize_text_by_loop()
待优化:
- 真实图片中,单元格的边线并不总是严格落在同一条横坐标或纵坐标上;仅用框线交叉点的坐标来确定单元格,切出的单元格图像可能不够完整。待后续优化:直接获取每个单元格自身的坐标。
边栏推荐
- 2022/7/27 考试总结
- 数字签名和CA证书
- awk从入门到入土(16)awk变量类型探讨--关于数字和string两种类型
- Rk3568 development board installation system startup
- Kubernetes技术与架构(七)
- Find out whether the number exists from the matrix
- Use ffmpeg to generate single image + single audio streaming video in batches
- In QT multithreading, in which thread does the slot function perform analysis
- CarSim simulation quick start (XI) - Driver Model (1)
- Mysql中有哪些不同的表格?
猜你喜欢

XSS knowledge points and 20 character short domain name bypass

数字签名和CA证书

Basic dictionary of deep learning --- activation function, batch size, normalization
![In the task manager, the CPU speed displayed is greater than its maximum speed [main frequency]](/img/90/a3f56ef8f08a8735febba16af227f9.png)
In the task manager, the CPU speed displayed is greater than its maximum speed [main frequency]

深度学习基础宝典---激活函数、Batch Size、归一化

Chairman tree review

本人男,27岁技术经理,收入太高,心头慌得一比

Information system project manager must recite the core examination site (41) risk management plan

A group of South University students rely on science and technology to go to sea, with an annual income of 1billion

【活动报名】云原生技术交流 Meetup,8 月 6 日广州见
随机推荐
MySQL query error [err] 1046 - no database selected
New generation cloud native message queue (II)
Can the variable modified by final be modified
MPLS --- 多协议标签交换技术
Talk about synchronous, asynchronous, blocking and non blocking
Meituan Er Mian: why does redis have sentinels?
JS candy xiaoxiaole game source code
解析树形结构 js
数字签名和CA证书
What if you are prompted that your connection to this website is not a private connection?
【13】 Adder: how to build a circuit like Lego (Part 1)?
Technology sharing | common proxy tools for interface testing
The core packages and middleware required for golang development cover all areas of the project and are worth collecting
CarSim simulation quick start (XIII) - steering system
[environment configuration] ppyoole trains its own data set (for its own use)
使用FFmpeg来批量生成单图+单音频的一图流视频
C#,入门教程——程序运行时的调试技巧与逻辑错误探针技术与源代码
mysql,可以使用多少列创建索引?
XSS knowledge points and 20 character short domain name bypass
Tell you step by step what you need to do to apply for PMP? What should I do?