MareArts Computer Vision Study: pymupdf

Showing posts with label pymupdf. Show all posts

12/01/2021

pdf2img, pdf to image, python library

way #1

# pip install pdf2image
from pdf2image import convert_from_path
pdffile = '2081033884.pdf'
# Convert the whole PDF at 500 DPI; convert_from_path yields one image per page.
pages = convert_from_path(pdffile, dpi=500)
# Store each page as a JPEG file named after its zero-based page index.
for i, page in enumerate(pages):
    page.save(f'pdf2image_{i}.jpg', format='JPEG')

way #2

#pip install pymupdf
import fitz
pdffile = '2081033884.pdf'
dpi = 300  # target output resolution
doc = fitz.open(pdffile)
# get_pixmap() renders at 72 dpi by default, and the dpi argument of pil_save()
# only writes metadata. Scale the rendering by dpi / 72 so the pixel data
# really has the resolution that is recorded in the JPEG.
zoom = dpi / 72
mat = fitz.Matrix(zoom, zoom)
#split pages
for i, page in enumerate(doc.pages()):
    pix = page.get_pixmap(matrix=mat)
    img_filename = f"fitz_{i}.jpg"
    # pil_save needs Pillow installed; further PIL save parameters can be appended
    pix.pil_save(img_filename, format="jpeg", dpi=(dpi, dpi))
doc.close()

Thank you.

www.marearts.com

🙇🏻‍♂️

5/02/2020

get image rect list from pdf

Extract the bounding rectangles of all images on a PDF page using PyMuPDF.
See the sample code below.

..

#pip install PyMuPDF
#document : https://pymupdf.readthedocs.io/en/latest/

#pip install opencv-python
#github : https://github.com/skvark/opencv-python

import fitz

doc1 = fitz.open('test.pdf')
page1 = doc1[0]  # first page

# get_text is the current name of the deprecated getText alias.
# The "dict" output lists the page's blocks; blocks with type == 1 are images.
blocks = page1.get_text("dict")["blocks"]

# One dict per image block, with the bbox coordinates truncated to integers.
img_bbox = [
    {'left': int(x1), 'top': int(y1), 'right': int(x2), 'bottom': int(y2)}
    for x1, y1, x2, y2 in (b['bbox'] for b in blocks if b["type"] == 1)
]

4/23/2020

Remove all images from a PDF file, Python source code

input

output

PyMuPDF is needed

pip install PyMuPDF

def remove_img_on_pdf(idoc, page):
    """Remove the image display commands from one page of a PyMuPDF document.

    The page's content streams are scanned for ``/<name> Do`` operators that
    invoke the images used by the page, and those operators are deleted. The
    image objects themselves stay in the file; they are just no longer drawn.

    Args:
        idoc: an open PyMuPDF document (as returned by ``fitz.open``).
        page: zero-based number of the page to clean.

    Returns:
        The same ``idoc`` object, modified in place.
    """
    import re  # local import keeps this snippet self-contained

    # One entry per image used by the page; item 7 of each entry is the name
    # under which the content stream refers to the image.
    img_list = idoc.get_page_images(page)
    # xref numbers of the page's /Contents objects (a page can have several)
    con_list = idoc[page].get_contents()

    for xref in con_list:
        stream = idoc.xref_stream(xref)  # read the stream source
        if stream is None:
            continue

        cleaned = stream
        for img in img_list:
            name = img[7].encode('utf-8')
            if not name:
                continue
            # Match the complete display command "/<name> Do" rather than the
            # bare name: removing only the name would leave a dangling "/ Do"
            # and could damage unrelated bytes that contain the same text.
            display_cmd = rb"/" + re.escape(name) + rb"\s+Do\b"
            cleaned = re.sub(display_cmd, b"", cleaned)

        # Write the stream back once, and only if something was removed.
        if cleaned != stream:
            idoc.update_stream(xref, cleaned)
    return idoc


# Open the input file, strip the images from its first page (index 0),
# and write the result under a new file name.
doc = fitz.open('example.PDF')
rdoc = remove_img_on_pdf(idoc=doc, page=0)
rdoc.save('no_img_example.PDF')

..

reference : https://github.com/pymupdf/PyMuPDF/issues/338

4/02/2020

PDF to OpenCV as page by page using PyMuPDF library (python example code)

Just see the example code below 😊

pip install PyMuPDF
document : https://pymupdf.readthedocs.io/en/latest/

I think this is a better library than PyPDF2 🤔
..

import fitz

import numpy as np

import cv2

fname = 'information-10-00248-v2'
doc = fitz.open(fname+'.pdf')

# The zoom factor is the same for every page, so build the matrix once.
# Raise zoom above 1 to render larger images.
zoom = 1
mat = fitz.Matrix(zoom, zoom)

#split pages
for i, page in enumerate(doc.pages()):
    print(i)

    # get_pixmap / tobytes are the current names of the deprecated
    # getPixmap / getImageData aliases.
    pix = page.get_pixmap(matrix=mat)
    imgData = pix.tobytes("png")  # PNG-encoded bytes of the rendered page

    #save image from byte
    with open('./save_by_byte_{}_{}.png'.format(fname, i), 'wb') as f:
        f.write(imgData)

    #save image from opencv
    # Decode the PNG bytes into an OpenCV image array before writing it out.
    nparr = np.frombuffer(imgData, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    print(img.shape)
    cv2.imwrite('./save_by_opencv_{}_{}.png'.format(fname, i), img)

11/13/2019

Split a PDF into pages and save each page as a separate PDF, Python

#manual https://pymupdf.readthedocs.io/en/latest/

#pip3 install PyMuPDF

import fitz

doc = fitz.open('./test2.pdf')

# len(doc) is the number of pages in the document.

#split pages
for i in range(len(doc)):
    print(i)

    # Start a new, empty PDF for this page.
    doc2 = fitz.open()

    # Copy exactly page i. With only to_page given, the copy would start at
    # the first page and every output file would contain pages 0..i.
    doc2.insert_pdf(doc, from_page=i, to_page=i)

    doc2.save("{}.pdf".format(i))
    doc2.close()

Pages