-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpackpdf.py
156 lines (124 loc) · 3.92 KB
/
packpdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# coding: utf-8
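# Sharpen the images embedded in a PDF for crisp, high-definition output,
# or sharpen a directory of images and pack them into a PDF.
# Usage: python packpdf.py <pdf|dir> [-s <size>] [-d]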
import numpy as np
import cv2
import sys
import os
from os import path
import re
import tempfile
import time
import argparse
import shutil
import fitz
import img2pdf
from imgyaso import adathres
import subprocess as subp
import platform
import uuid
def dump_pdf(fname, dir):
    """Extract every embedded image of a PDF into ``dir`` as PNG files.

    The xref table is scanned for objects whose definition contains both
    ``/Type /XObject`` and ``/Subtype /Image``. Each match is written as
    ``<index>.png``, the index being zero-padded so all names share one width.

    Args:
        fname: Path of the PDF to read.
        dir: Existing directory that receives the PNG files.
    """
    re_xobj = re.compile(r"/Type\s*/XObject")
    re_img = re.compile(r"/Subtype\s*/Image")
    doc = fitz.open(fname)
    try:
        img_idcs = []
        # xref numbers start at 1; entry 0 is reserved.
        for i in range(1, doc.xref_length()):
            xref = doc.xref_object(i)
            if re_xobj.search(xref) and re_img.search(xref):
                img_idcs.append(i)
        # Zero-pad the running index so every file name has the same width.
        width = len(str(max(len(img_idcs) - 1, 0)))
        for i, j in enumerate(img_idcs):
            print(f'no: {i}, xref: {j}')
            img = fitz.Pixmap(doc, j)
            # PNG can only hold gray or RGB data: anything with four or more
            # colour components (CMYK, with or without alpha) must be
            # converted to RGB first. The converted pixmap has to replace
            # ``img``, otherwise the unconverted one would still be written.
            if img.n - img.alpha >= 4:
                img = fitz.Pixmap(fitz.csRGB, img)
            imgname = path.join(dir, f'{i:0{width}d}.png')
            img.writePNG(imgname)
    finally:
        # Release the document even when extraction fails half-way.
        doc.close()


# Disabled alternative kept from the original source (convert_from_path is
# not imported in this file; presumably pdf2image's page renderer):
#   imgs = convert_from_path(fname)
#   l = len(str(len(imgs) - 1))
#   for i, img in enumerate(imgs):
#       picname = path.join(dir, f'{i:0{l}d}.png')
#       img.save(picname, 'png')
def gen_pdf(fname, dir):
    """Pack all pictures found in ``dir`` into one PDF written to ``fname``.

    Args:
        fname: Path of the PDF file to create (overwritten if present).
        dir: Directory whose picture files (as selected by ``is_pic``)
            become the pages of the PDF, ordered by file name.
    """
    # os.listdir() returns entries in arbitrary order; sort the names so the
    # page order is deterministic instead of depending on the file system.
    names = sorted(filter(is_pic, os.listdir(dir)))
    files = [path.join(dir, name) for name in names]
    pdf = img2pdf.convert(files)
    with open(fname, 'wb') as f:
        f.write(pdf)
def process_img(img, size=0, deskew=False):
    """Convert encoded image data into a thresholded, bilevel PNG.

    Args:
        img: Content of an image file as bytes.
        size: Target width in pixels. The height is scaled by the same
            factor. ``0`` (the default) keeps the original dimensions.
        deskew: When true, the data is passed through ``magick_deskew``
            before anything else.

    Returns:
        The processed picture as PNG-encoded bytes.

    Raises:
        ValueError: If the data cannot be decoded into a grayscale image.
    """
    # Deskew first, while the data is still an encoded file.
    if deskew:
        img = magick_deskew(img)
    buf = np.frombuffer(img, np.uint8)
    gray = cv2.imdecode(buf, cv2.IMREAD_GRAYSCALE)
    # imdecode() signals an unreadable image by returning None; check it
    # explicitly rather than with an assert, which is stripped under -O.
    if gray is None or gray.ndim != 2:
        raise ValueError('cannot decode image data as a grayscale picture')
    # Resize to the requested width, keeping the aspect ratio.
    if size:
        h, w = gray.shape
        nh = round(h * (size / w))
        gray = cv2.resize(gray, (size, nh), interpolation=cv2.INTER_CUBIC)
    # Black/white conversion (adathres comes from imgyaso; presumably an
    # adaptive threshold with parameter 9 - confirm against that library).
    bw = adathres(gray, 9)
    encoded = cv2.imencode('.png', bw, [
        cv2.IMWRITE_PNG_BILEVEL, 1,
    ])[1]
    return bytes(encoded)
def is_pic(s):
    """Tell whether the name ``s`` ends with a supported picture suffix.

    Only the lowercase suffixes ``.jpg``, ``.png`` and ``.gif`` match.
    """
    return s.endswith(('.jpg', '.png', '.gif'))
def process_dir(args):
    """Process every picture of a directory in place and pack them as a PDF.

    Each picture file is read, run through ``process_img`` and written back
    under the same name. The PDF is then created next to the directory as
    ``<directory name>.pdf``. Nothing is touched if that PDF already exists.

    Args:
        args: Object with the attributes ``fname`` (directory path),
            ``size`` and ``deskew`` (both forwarded to ``process_img``).
    """
    # Drop all trailing separators so basename() yields the directory name.
    dir = args.fname.rstrip('\\/')
    p = path.join(path.dirname(dir), path.basename(dir) + '.pdf')
    if path.exists(p):
        print('文件已存在')
        return
    # Sorted so the processing (and log) order is deterministic.
    for name in sorted(filter(is_pic, os.listdir(dir))):
        print(name)
        f = path.join(dir, name)
        # Context managers make sure each handle is closed before the same
        # file is reopened for writing.
        with open(f, 'rb') as fh:
            img = fh.read()
        img = process_img(img, size=args.size, deskew=args.deskew)
        with open(f, 'wb') as fh:
            fh.write(img)
    gen_pdf(p, dir)
def magick_deskew(img):
    """Deskew encoded image data with the ImageMagick command-line tool.

    The data is written to a temporary ``.png`` file, which
    ``convert <file> -deskew 40% <file>`` rewrites in place
    (``magick convert`` on Windows). The file is then read back and removed.

    This is best effort: the exit status is not checked and a missing
    executable is only reported on stderr. Whatever the file contains
    afterwards is returned, so a failed run normally yields the input
    unchanged.

    Args:
        img: Content of an image file as bytes.

    Returns:
        The content of the temporary file after the command ran, as bytes.
    """
    fd, fname = tempfile.mkstemp(suffix='.png')
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(img)
        # Argument list instead of a shell string: no quoting problems with
        # unusual temp paths and no shell involved.
        cmd = ['convert', fname, '-deskew', '40%', fname]
        if platform.system().lower() == 'windows':
            cmd.insert(0, 'magick')
        try:
            subp.run(cmd)
        except OSError as e:
            # Without a shell a missing executable raises instead of just
            # printing an error; keep the original best-effort behaviour.
            print(f'deskew skipped: {e}', file=sys.stderr)
        with open(fname, 'rb') as f:
            return f.read()
    finally:
        # Never leave the temporary file behind.
        try:
            os.remove(fname)
        except OSError:
            pass
def process_pdf(args):
    """Sharpen a PDF in place, keeping the original as ``<fname>.bak``.

    The embedded images are dumped into a fresh temporary directory, which
    ``process_dir`` turns into ``<temp dir>.pdf``. The original file is then
    renamed to ``<fname>.bak`` and the new PDF is moved to ``fname``.

    Args:
        args: Object with the attributes ``fname`` (path of the PDF),
            ``size`` and ``deskew`` (used by ``process_dir``). ``fname`` is
            temporarily pointed at the working directory and restored
            before returning.
    """
    fname = args.fname
    # mkdtemp() guarantees a unique, freshly created directory; a name
    # derived from the clock can collide between concurrent runs.
    dir = tempfile.mkdtemp()
    try:
        print(f'{fname} 导出中...')
        dump_pdf(fname, dir)
        print(f'{fname} 导出完毕')
        args.fname = dir
        process_dir(args)
    finally:
        # Restore the caller's namespace and remove the working directory
        # even when extraction or processing fails.
        args.fname = fname
        shutil.rmtree(dir, ignore_errors=True)
    # Only reached on success: swap the new file in, keeping a backup.
    os.rename(fname, fname + '.bak')
    shutil.move(dir + '.pdf', fname)
def main():
    """Command-line entry point.

    Parses ``fname`` plus the optional ``-s/--size`` and ``-d/--deskew``
    flags, then hands the parsed arguments to ``process_dir`` when ``fname``
    is a directory or to ``process_pdf`` when it ends with ``.pdf``.
    Anything else only prints a hint.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("fname", help="pdf or dir name")
    cli.add_argument("-s", "--size", type=int, default=0, help="width of pics")
    cli.add_argument("-d", "--deskew", action='store_true', help="deskew or not")
    args = cli.parse_args()

    target = args.fname
    if path.isdir(target):
        process_dir(args)
        return
    if target.endswith('.pdf'):
        process_pdf(args)
        return
    print('请提供目录或 PDF')


if __name__ == '__main__':
    main()