当前位置:网站首页>C#/VB.NET:从 PDF 文档中提取所有表格
C#/VB.NET:从 PDF 文档中提取所有表格
2022-08-01 18:00:00 【InfoQ】
安装 Spire.PDF for .NET
从PDF文档中提取表格
using Spire.Pdf;
using Spire.Pdf.Utilities;
using System.IO;
using System.Text;
namespace ExtractTable
{
    /// <summary>
    /// Sample program that extracts the text of every table found in a PDF
    /// document and writes it to a plain-text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "sample.pdf", walks every page, extracts each table's cell text
        /// and saves the result to "ExtractedTable.txt".
        /// Cells are separated by a single space and every table row ends with CRLF.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Accumulates the text of all tables across all pages.
            StringBuilder builder = new StringBuilder();

            // The document is wrapped in a using statement so the underlying file
            // handle is released as soon as extraction finishes, even if
            // extraction throws.
            using (PdfDocument pdf = new PdfDocument())
            {
                // Load the PDF document.
                pdf.LoadFromFile("sample.pdf");

                // The extractor is bound to the loaded document.
                PdfTableExtractor extractor = new PdfTableExtractor(pdf);

                // Iterate over the pages of the PDF.
                for (int pageIndex = 0; pageIndex < pdf.Pages.Count; pageIndex++)
                {
                    // Extract the tables of the current page.
                    PdfTable[] tableLists = extractor.ExtractTable(pageIndex);

                    // Skip pages for which no tables were returned.
                    if (tableLists == null || tableLists.Length == 0)
                    {
                        continue;
                    }

                    foreach (PdfTable table in tableLists)
                    {
                        // Row and column counts of the current table.
                        int row = table.GetRowCount();
                        int column = table.GetColumnCount();

                        for (int i = 0; i < row; i++)
                        {
                            for (int j = 0; j < column; j++)
                            {
                                // Append the cell text followed by a space separator;
                                // chained Append avoids building a temporary string per cell.
                                builder.Append(table.GetText(i, j)).Append(" ");
                            }

                            // Terminate the table row.
                            builder.Append("\r\n");
                        }
                    }
                }
            }

            // Save the extracted table content as a .txt document.
            File.WriteAllText("ExtractedTable.txt", builder.ToString());
        }
    }
}
Imports Spire.Pdf
Imports Spire.Pdf.Utilities
Imports System.IO
Imports System.Text
Namespace ExtractTable
    ''' <summary>
    ''' Sample program that extracts the text of every table found in a PDF
    ''' document and writes it to a plain-text file.
    ''' </summary>
    Class Program
        ''' <summary>
        ''' Loads "sample.pdf", walks every page, extracts each table's cell text
        ''' and saves the result to "ExtractedTable.txt".
        ''' Cells are separated by a single space and every table row ends with CRLF.
        ''' </summary>
        ''' <param name="args">Command-line arguments (unused).</param>
        Private Shared Sub Main(args As String())
            ' Accumulates the text of all tables across all pages.
            Dim builder As New StringBuilder()

            ' The document is wrapped in a Using block so the underlying file
            ' handle is released as soon as extraction finishes, even if
            ' extraction throws.
            Using pdf As New PdfDocument()
                ' Load the PDF document.
                pdf.LoadFromFile("sample.pdf")

                ' The extractor is bound to the loaded document.
                Dim extractor As New PdfTableExtractor(pdf)

                ' Iterate over the pages of the PDF.
                For pageIndex As Integer = 0 To pdf.Pages.Count - 1
                    ' Extract the tables of the current page.
                    Dim tableLists As PdfTable() = extractor.ExtractTable(pageIndex)

                    ' Skip pages for which no tables were returned.
                    If tableLists Is Nothing OrElse tableLists.Length = 0 Then
                        Continue For
                    End If

                    For Each table As PdfTable In tableLists
                        ' Row and column counts of the current table.
                        Dim row As Integer = table.GetRowCount()
                        Dim column As Integer = table.GetColumnCount()

                        For i As Integer = 0 To row - 1
                            For j As Integer = 0 To column - 1
                                ' Append the cell text followed by a space separator;
                                ' chained Append avoids building a temporary string per cell.
                                builder.Append(table.GetText(i, j)).Append(" ")
                            Next

                            ' Terminate the table row (CR + LF).
                            builder.Append(vbCrLf)
                        Next
                    Next
                Next
            End Using

            ' Save the extracted table content as a .txt document.
            File.WriteAllText("ExtractedTable.txt", builder.ToString())
        End Sub
    End Class
End Namespace


边栏推荐
猜你喜欢
XAML WPF item groupBox control
频域分析实践介绍
Xingtu has been short of disruptive products?Will this M38T from the Qingdao factory be a breakthrough?
Detailed explanation of the working principle of crystal oscillator
How to use the Golang coroutine scheduler scheduler
粒子滤波 particle filter —从贝叶斯滤波到粒子滤波——Part-I(贝叶斯滤波)
QLineEdit learning and use
C language theory--a solid foundation for the written test and interview
QT_QThread thread
存储日报-数据湖架构权威指南(使用 Iceberg 和 MinIO)
随机推荐
频域分析实践介绍
Leetcode72. Edit Distance
XAML WPF项目groupBox控件
关于单应性矩阵的若干思考
【Day_08 0426】求最小公倍数
Leetcode73. Matrix Zeroing
Leetcode74. 搜索二维矩阵
CodeTON Round 2 (Div. 1 + Div. 2, Rated, Prizes!) 题解
TCP million concurrent server optimization parameters
不需要写代码,快速批量修改文件夹中图片的格式
【无标题】setInterval和setTimeout详解
tooltip 控件
Leetcode71. 简化路径
移动端吸顶方案
MySql 怎么查出符合条件的最新的数据行?
EpiSci | Deep Reinforcement Learning for SoCs: Myth and Reality
GRUB2的零日漏洞补丁现已推出
el-form-item prop属性动态绑定不生效如何解决
hcip第九天
【翻译】CNCF培养的OpenMetrics成为一个孵化项目