4/23/2020

Remove all images from a PDF file (Python source code)

input
output


PyMuPDF is required; install it with:
pip install PyMuPDF
..

def remove_img_on_pdf(idoc, page):
    """Remove the image display commands from one page of a PDF document.

    The page's content streams are searched for ``/<name> Do`` operators
    that reference the page's images, and those operators are deleted.
    Only the display commands are removed from the content streams; this
    function does not touch any other object of the document.

    Args:
        idoc: An opened PyMuPDF document (legacy API: ``getPageImageList``,
            ``_getXrefStream``, ``_updateStream``).
        page: Zero-based number of the page to process.

    Returns:
        The same ``idoc`` object, modified in place.
    """
    import re  # local import: the snippet has no top-of-file import block

    # Item 7 of every image entry is the name under which the content
    # stream references the image (e.g. "Im1"), per the PyMuPDF docs.
    img_list = idoc.getPageImageList(page)
    con_list = idoc[page]._getContents()

    names = [bytes(entry[7], 'utf-8') for entry in img_list if entry[7]]
    if not names:
        return idoc

    # Match the complete display command "/<name> Do".  Removing only the
    # bare name would leave a dangling "/ Do" behind and would also damage
    # longer names that merely contain it (e.g. "Im1" inside "/Im10").
    display_cmd = re.compile(
        rb"/(?:" + b"|".join(re.escape(name) for name in names) + rb")\s+Do\b"
    )

    # A page may have more than one /Contents object, so check each of them.
    for xref in con_list:
        stream = idoc._getXrefStream(xref)  # read the stream source
        if stream is None:
            continue
        cleaned, hits = display_cmd.subn(b"", stream)
        if hits:
            # Write back once per stream, and only if something changed.
            idoc._updateStream(xref, cleaned)
    return idoc


# Example usage: strip the images from the first page of 'example.PDF'
# and save the result as a new file.
if __name__ == "__main__":
    import fitz  # PyMuPDF; the original snippet used fitz without importing it

    doc = fitz.open('example.PDF')
    rdoc = remove_img_on_pdf(doc, 0)  # first page
    rdoc.save('no_img_example.PDF')
..

reference : https://github.com/pymupdf/PyMuPDF/issues/338