当前位置:网站首页>C#/VB.NET:从 PDF 文档中提取所有表格
C#/VB.NET:从 PDF 文档中提取所有表格
2022-08-01 18:00:00 【InfoQ】
安装 Spire.PDF for .NET
从PDF文档中提取表格
using Spire.Pdf;
using Spire.Pdf.Utilities;
using System.IO;
using System.Text;
namespace ExtractTable
{
    /// <summary>
    /// Console sample that extracts the text of every table in a PDF document
    /// and saves it to a plain-text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "sample.pdf", visits every table on every page cell by cell,
        /// and writes the collected text to "ExtractedTable.txt".
        /// Cells are followed by a single space and each table row ends with CRLF.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Accumulates the text of all extracted tables.
            StringBuilder builder = new StringBuilder();

            PdfDocument pdf = new PdfDocument();
            try
            {
                // Load the PDF document.
                pdf.LoadFromFile("sample.pdf");

                // The extractor pulls table structures out of the loaded document.
                PdfTableExtractor extractor = new PdfTableExtractor(pdf);

                // Walk every page of the document.
                for (int pageIndex = 0; pageIndex < pdf.Pages.Count; pageIndex++)
                {
                    // Extract the tables found on this page.
                    PdfTable[] tableLists = extractor.ExtractTable(pageIndex);

                    // Skip pages for which no tables were returned.
                    if (tableLists == null || tableLists.Length == 0)
                    {
                        continue;
                    }

                    foreach (PdfTable table in tableLists)
                    {
                        // Table dimensions.
                        int row = table.GetRowCount();
                        int column = table.GetColumnCount();

                        for (int i = 0; i < row; i++)
                        {
                            for (int j = 0; j < column; j++)
                            {
                                // Append the cell text followed by a separator space,
                                // without building a temporary concatenated string.
                                builder.Append(table.GetText(i, j)).Append(" ");
                            }

                            // End of a table row.
                            builder.Append("\r\n");
                        }
                    }
                }
            }
            finally
            {
                // Always close the document, even when loading or extraction throws.
                pdf.Close();
            }

            // Save the extracted table content as a .txt file.
            File.WriteAllText("ExtractedTable.txt", builder.ToString());
        }
    }
}
Imports Spire.Pdf
Imports Spire.Pdf.Utilities
Imports System.IO
Imports System.Text
Namespace ExtractTable
    ''' <summary>
    ''' Console sample that extracts the text of every table in a PDF document
    ''' and saves it to a plain-text file.
    ''' </summary>
    Class Program
        ''' <summary>
        ''' Loads "sample.pdf", visits every table on every page cell by cell,
        ''' and writes the collected text to "ExtractedTable.txt".
        ''' Cells are followed by a single space and each table row ends with CRLF.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Private Shared Sub Main(args As String())
            ' Accumulates the text of all extracted tables.
            Dim builder As New StringBuilder()

            Dim pdf As New PdfDocument()
            Try
                ' Load the PDF document.
                pdf.LoadFromFile("sample.pdf")

                ' The extractor pulls table structures out of the loaded document.
                Dim extractor As New PdfTableExtractor(pdf)

                ' Walk every page of the document.
                For pageIndex As Integer = 0 To pdf.Pages.Count - 1
                    ' Extract the tables found on this page.
                    Dim tableLists As PdfTable() = extractor.ExtractTable(pageIndex)

                    ' Skip pages for which no tables were returned.
                    If tableLists Is Nothing OrElse tableLists.Length = 0 Then
                        Continue For
                    End If

                    For Each table As PdfTable In tableLists
                        ' Table dimensions.
                        Dim row As Integer = table.GetRowCount()
                        Dim column As Integer = table.GetColumnCount()

                        For i As Integer = 0 To row - 1
                            For j As Integer = 0 To column - 1
                                ' Append the cell text followed by a separator space,
                                ' without building a temporary concatenated string.
                                builder.Append(table.GetText(i, j)).Append(" ")
                            Next

                            ' End of a table row.
                            builder.Append(vbCrLf)
                        Next
                    Next
                Next
            Finally
                ' Always close the document, even when loading or extraction throws.
                pdf.Close()
            End Try

            ' Save the extracted table content as a .txt file.
            File.WriteAllText("ExtractedTable.txt", builder.ToString())
        End Sub
    End Class
End Namespace


边栏推荐
猜你喜欢

MySQL 45 Talk | 09 How to choose common index and unique index?

【Day_12 0507】查找组成一个偶数最接近的两个素数

RecSys'22|CARCA: Cross-Attention-Aware Context and Attribute Recommendations

Leetcode72. Edit Distance

TCP million concurrent server optimization parameters

【报错】Uncaught (in promise) TypeError: Cannot read properties of undefined (reading 'concat')

【Day_09 0427】 另类加法
Detailed explanation of DBPack SQL Tracing function and data encryption function

Leetcode74. 搜索二维矩阵

LeaRun.net快速开发动态表单
随机推荐
SQL的索引详细介绍
2022年MySQL最新面试题
RecSys'22|CARCA:交叉注意力感知上下文和属性进行推荐
小贝拉机器人是朋友_普渡科技召开新品发布会,新一代送餐机器人“贝拉”温暖登场...
解决MySQL插入不了中文数据问题
塔防海岸线用户协议
关于MySql中explain结果filtered的理解
2022年SQL经典面试题总结(带解析)
EpiSci | Deep Reinforcement Learning for SoCs: Myth and Reality
极化微波成像概述
Review实战经典:2 种封装风格,你偏爱哪种?
QT basic functions, signals, slots
Shell nl命令详解(显示行号、读取文件)
B001 - 基于STM32的智能生态鱼缸
极化微波成像概述2
QLineEdit learning and use
B005 - STC8 based single chip microcomputer intelligent street light control system
【Day_11 0506】求最大连续bit数
QPalette palette, frame color fill
Leetcode72. 编辑距离