当前位置:网站首页>Pdf extract text
Extracting text from PDF files
2022-07-27 16:30:00 【Sweet scented osmanthus is very fragrant, and the rising sun is】
import os
import pandas as pd
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# Batch conversion script: extract the text of every PDF found under
# pdf_sentence/ into a same-named .txt file under pdf_txt/.

# debug option
debug = 1
# input option
password = b''
pagenos = set()
maxpages = 0
# output option
outtype = 'text'
imagewriter = None
rotation = 0
stripcontrol = False
layoutmode = 'normal'
encoding = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
# Push the debug level onto the pdfminer classes used below.
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFPageInterpreter.debug = debug

# Walk the input directory and convert every PDF file found.
import time

input_dir = "pdf_sentence/"
output_dir = "pdf_txt/"
# Create the output directory up front so open() below cannot fail on it.
os.makedirs(output_dir, exist_ok=True)

txt_list = []  # names of the .txt files produced so far
for root, dirs, files in os.walk(input_dir, topdown=False):
    for i, name in enumerate(files):
        # splitext copes with names that have no dot or several dots,
        # where name.split('.')[1] would raise or pick the wrong part.
        stem, ext = os.path.splitext(name)
        if ext.lower() != '.pdf':
            continue
        print(i, name)
        fname = name
        outfile = stem + '.txt'
        rsrcmgr = PDFResourceManager(caching=caching)
        # 'a+' appends: running the script again adds to an existing file.
        with open(os.path.join(output_dir, outfile), 'a+',
                  encoding=encoding) as outfp:
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
            try:
                # Join with root so PDFs inside subdirectories are found.
                with open(os.path.join(root, fname), 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate+rotation) % 360
                        # NOTE(review): one-second pause per page kept from
                        # the original; its purpose is not visible here.
                        time.sleep(1)
                        interpreter.process_page(page)
            finally:
                # Release the converter even when a page fails to parse.
                device.close()
        txt_list.append(outfile)
        print(fname + " done !")
        print(" device release")
# (article text) There is also a simpler way:
# Walk the input directory and convert every PDF by running the
# pdf2txt_self.py helper script once per file.
import time
import os
import subprocess
import sys

# The helper opens its output file directly, so the directory must exist.
os.makedirs("pdf_txt/", exist_ok=True)

txt_list = []
for root, dirs, files in os.walk("pdf_sentence/", topdown=False):
    for i, name in enumerate(files):
        # splitext copes with names that have no dot or several dots,
        # where name.split('.')[1] would raise or pick the wrong part.
        stem, ext = os.path.splitext(name)
        if ext.lower() != '.pdf':
            continue
        fname = name
        outfile = stem
        # An argument list (no shell) keeps file names containing spaces
        # or shell metacharacters from breaking or altering the command.
        command = [
            sys.executable, "pdf2txt_self.py",
            "-o", "pdf_txt/%s.txt" % outfile,
            "-d", os.path.join(root, fname),
        ]
        status = subprocess.run(command).returncode
        if status == 0:
            print("extract "+name+ " done!")
        else:
            # Report the failure instead of claiming success.
            print("extract " + name + " failed with exit status %d" % status)
# (article text) Here we call pdf2txt.py
Take a look at its code:
#!/usr/bin/env python
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# main
def main(argv):
    """Run the pdf2txt command line: convert PDFs to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; argv[0] is the program name and the
            remaining items are options followed by input PDF paths.

    Returns:
        100 after printing the usage line (unknown option, no input file,
        or unsupported output type); None after a successful conversion.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    # Layout settings are collected first and applied once parsing is done,
    # so "-n" followed by e.g. "-A" no longer fails on a None laparams.
    use_layout = True
    layout_overrides = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update(int(x)-1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': layout_overrides['all_texts'] = True
        elif k == '-V': layout_overrides['detect_vertical'] = True
        elif k == '-M': layout_overrides['char_margin'] = float(v)
        elif k == '-W': layout_overrides['word_margin'] = float(v)
        elif k == '-L': layout_overrides['line_margin'] = float(v)
        elif k == '-F': layout_overrides['boxes_flow'] = float(v)

    # Without -t, default to text unless the output name implies a format.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened, because
    # opening with 'w' would truncate an existing file for nothing.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    laparams = None
    if use_layout:
        laparams = LAParams()
        for attr, value in layout_overrides.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    owns_output = bool(outfile)
    outfp = open(outfile, 'w', encoding=encoding) if owns_output else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate+rotation) % 360
                        interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to parse.
            device.close()
    finally:
        # Only close a file opened here; sys.stdout belongs to the caller.
        if owns_output:
            outfp.close()
    return


if __name__ == '__main__': sys.exit(main(sys.argv))
The code is clearly written. With a few small changes (it is basically unchanged) you can use it directly:
#!/usr/bin/env python
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# main
def main(argv):
    """Run the text-only pdf2txt variant: append extracted PDF text to a file.

    Only the 'text' output type is implemented. The other options on the
    usage line are still parsed so existing command lines keep working.

    Args:
        argv: Full argument vector; argv[0] is the program name and the
            remaining items are options followed by input PDF paths.

    Returns:
        100 after printing the usage line (unknown option, no input file,
        or an output type other than 'text'); None after a successful run.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'text'
    imagewriter = None
    rotation = 0
    encoding = 'utf-8'
    caching = True
    # Layout settings are collected first and applied once parsing is done,
    # so "-n" followed by e.g. "-A" no longer fails on a None laparams.
    use_layout = True
    layout_overrides = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        # -s is parsed so a bad number is still rejected, but the value
        # (like -Y and -S) has no effect on text output.
        elif k == '-s': float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-p': pagenos.update(int(x)-1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': layout_overrides['all_texts'] = True
        elif k == '-V': layout_overrides['detect_vertical'] = True
        elif k == '-M': layout_overrides['char_margin'] = float(v)
        elif k == '-W': layout_overrides['word_margin'] = float(v)
        elif k == '-L': layout_overrides['line_margin'] = float(v)
        elif k == '-F': layout_overrides['boxes_flow'] = float(v)

    # No converter exists here for html/xml/tag. Fail with the usage line
    # instead of reaching the page loop with no device defined.
    if outtype != 'text':
        return usage()

    laparams = None
    if use_layout:
        laparams = LAParams()
        for attr, value in layout_overrides.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    owns_output = bool(outfile)
    # 'a+' appends, so repeated runs add to an existing output file.
    outfp = open(outfile, 'a+', encoding=encoding) if owns_output else sys.stdout
    try:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate+rotation) % 360
                        interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to parse.
            device.close()
    finally:
        # Only close a file opened here; sys.stdout belongs to the caller.
        if owns_output:
            outfp.close()
    return


if __name__ == '__main__': sys.exit(main(sys.argv))
边栏推荐
- web测试学习笔记01
- MySQL high version report SQL_ mode=only_ full_ group_ By exception
- Coding technique - Global log switch
- C语言逆序输出字符串
- Yys mouse connector
- Time series ARIMA model
- 插入word中的图片保持高dpi方法
- 补充—整数规划例题
- 201403-1
- Mazak handwheel maintenance Mazak little giant CNC machine tool handle operator maintenance av-eahs-382-1
猜你喜欢

web测试学习笔记01

DRF learning notes (II): Data deserialization

Leetcode234 question - simple method to judge palindrome linked list

OpenCV(五)——运动目标识别

The new JMeter function assistant is not under the options menu - in the toolbar

training on multiple GPUs pytorch

EXE程序加密锁

Leetcode 226 翻转二叉树(递归)

Nacos

COMS Technology
随机推荐
Pychart imports the existing local installation package
MapReduce instance (III): data De duplication
JMeter5.3 及以后的版本jmeter函数助手生成的字符在置灰无法复制
最大子段和 Go 四种的四种求解
The method of inserting degree in word
MapReduce instance (I): wordcount
Analysis of PHP keyword replacement classes (avoid repeated replacement, keep and restore the original links)
201403-1
Coturn service installation in webrtc
Implementation of ByteDance service grid based on Hertz framework
清晰的认识Torchvision(思维导图版)
插入word中的图片保持高dpi方法
training on multiple GPUs pytorch
Rotate string left
CCF-201312-1
JWT简介
低代码是开发的未来吗?浅谈低代码平台
DRF learning notes (preparation)
新版jmeter函数助手不在选项菜单下-在工具栏中
Two methods of generating excel table with PHP