-
Notifications
You must be signed in to change notification settings - Fork 1
/
Pdf2Text.py
55 lines (45 loc) · 1.87 KB
/
Pdf2Text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys
import importlib
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LTImage, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from Translate import translate
import fire
def parse(file_name, target_name):
    """Extract the text of a PDF, translate it, and append it to a text file.

    Each horizontal text box found on each page is passed through
    ``translate`` and the result, followed by a newline, is appended to
    ``target_name``. Progress is reported by printing the 1-based page
    number before each page is processed.

    Args:
        file_name: Path of the PDF file to read.
        target_name: Path of the text file to append to. It is opened in
            append mode, so any existing content is kept.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    # The parser is built on this handle, so keep it open until every page
    # has been processed and let the context manager close it afterwards.
    with open(file_name, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        # Link parser and document to each other before initialising.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % (file_name,)
            )

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the target once instead of once per text box. Append mode is
        # kept so repeated runs still accumulate into the same file.
        # NOTE: the file is now created even if no text box is found.
        with open(target_name, 'a') as out_file:
            for page_number, page in enumerate(doc.get_pages(), start=1):
                print('page: ' + str(page_number))
                interpreter.process_page(page)
                # layout is an LTPage object holding the objects parsed from
                # this page, typically LTTextBox, LTFigure, LTImage,
                # LTTextBoxHorizontal and so on. Text is read via get_text().
                layout = device.get_result()
                for element in layout:
                    # Only horizontal text boxes are written; other layout
                    # objects (e.g. LTImage) are skipped.
                    if isinstance(element, LTTextBoxHorizontal):
                        translated = translate(element.get_text())
                        out_file.write(translated + '\n')
# When run as a script, hand parse() to python-fire so that it is invoked
# from the command line.
if __name__ == "__main__":
    fire.Fire(parse)