MareArts Computer Vision Study.: 2020.04

4/23/2020

Remove all images from a PDF file (Python source code)

input

output

PyMuPDF is required:

pip install PyMuPDF

def remove_img_on_pdf(idoc, page):
    """Remove the image-drawing commands from one page of a PyMuPDF document.

    For every image reported by ``idoc.getPageImageList(page)``, each
    ``/<name> Do`` operator that paints it is deleted from the page's
    content streams.  Only the content streams are edited; the image
    objects themselves are not touched.

    Args:
        idoc: An open PyMuPDF (``fitz``) document; it is modified in place.
        page: Zero-based number of the page to clean.

    Returns:
        The same ``idoc`` object, so the call can be chained with ``save``.
    """
    import re  # local import keeps this snippet self-contained

    # Index 7 of each image-list entry is the image's resource name
    # (e.g. "Im1"), which is what the content stream refers to.
    names = [entry[7] for entry in idoc.getPageImageList(page) if entry[7]]
    if not names:
        # Nothing to remove; also avoids building an empty alternation
        # that would match a bare "/ Do".
        return idoc

    # Match the complete "/<name> Do" invocation only.  Replacing the bare
    # name would also hit longer names ("Im1" inside "/Im10 Do") or text,
    # and would leave a dangling "/ Do" operator behind.
    draw_cmd = re.compile(
        rb"/(?:" + b"|".join(re.escape(n.encode("utf-8")) for n in names)
        + rb")\s+Do(?![^\s()<>\[\]{}/%])"
    )

    # A page may have one or several /Contents objects; process each one.
    for xref in idoc[page]._getContents():
        stream = idoc._getXrefStream(xref)  # read the stream source
        if stream is None:
            continue
        cleaned = draw_cmd.sub(b"", stream)
        if cleaned != stream:
            # Write back once per stream, and only when something changed.
            idoc._updateStream(xref, cleaned)
    return idoc


import fitz  # PyMuPDF; required for fitz.open below

# Open the source PDF, strip the images from its first page, and save the
# result under a new name so the original file is left untouched.
doc = fitz.open('example.PDF')
rdoc = remove_img_on_pdf(doc, 0)  # first page
rdoc.save('no_img_example.PDF')

..

reference : https://github.com/pymupdf/PyMuPDF/issues/338

4/16/2020

Python OpenCV Image to byte string for json transfer

code :

import cv2
import base64
import json
import numpy as np

######################################################
# Encode: image file -> JPEG bytes -> base64 text -> JSON file
# read image
img = cv2.imread('./code_backup/test_img.jpg')
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None
    raise FileNotFoundError('./code_backup/test_img.jpg')
# cv2 to string (JPEG-encode, then base64 so the bytes are JSON-safe text)
image_string = cv2.imencode('.jpg', img)[1]
image_string = base64.b64encode(image_string).decode()
# make string image dict (named `payload` so the builtin `dict` is not shadowed)
payload = {'img': image_string}
# save dict to json file
with open('./code_backup/cv2string.json', 'w') as fp:
    json.dump(payload, fp, indent=5)
######################################################


######################################################
# Decode: JSON file -> base64 text -> JPEG bytes -> image
# read json (context manager so the file handle is always closed)
with open('./code_backup/cv2string.json', 'r') as fp:
    response = json.load(fp)
# get image string
string = response['img']
# convert string to image
jpg_original = base64.b64decode(string)
jpg_as_np = np.frombuffer(jpg_original, dtype=np.uint8)
img = cv2.imdecode(jpg_as_np, flags=1)
# show image
cv2.imshow('show image', img)
cv2.waitKey(0)
######################################################

..

This is the input image.

This is the abbreviated JSON file:

{ "img": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAFoAeADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD2BP8Ag34/4JIE4P7I4/8AC617/wCTqsJ/wb4/8EiTx/wyR/5fWv8A/wAnV9nPH2x+FSwx14/t6vc+i+r4f+Q+Mk/4N7P+CQ+P+TRx/wC ........."

}

4/03/2020

Example Python code: download an S3 object as an OpenCV image in memory, then upload the result

Just see the code
It's not difficult.

...

...

import cv2

import numpy as np

...

def lambda_handler(event, context):
    """Resize an uploaded S3 image to 100x100 and store it as PNG.

    Reads the bucket and key from the first record of the S3 event,
    downloads the object into memory, decodes it with OpenCV, resizes it,
    and uploads the PNG-encoded result to the ``thum-prj-output`` bucket
    under the same key.

    Args:
        event: S3 event notification; only ``event['Records'][0]`` is used.
        context: Lambda context object (unused).

    Raises:
        ValueError: If the downloaded object cannot be decoded as an image.
    """
    from urllib.parse import unquote_plus  # local import: snippet has no import block for it

    s3_info = event['Records'][0]['s3']
    bucket_name = s3_info['bucket']['name']
    # Object keys arrive URL-encoded in S3 event notifications (spaces as
    # '+', etc.), so decode before using the key with the S3 API.
    s3_path = unquote_plus(s3_info['object']['key'])

    # download object
    obj = s3_client.get_object(Bucket=bucket_name, Key=s3_path)

    # obj to cv2
    nparr = np.frombuffer(obj['Body'].read(), np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imdecode returns None when the bytes are not a readable image
        raise ValueError('cannot decode image: {}/{}'.format(bucket_name, s3_path))

    # simple image processing
    reimg = cv2.resize(img, (100, 100))

    # cv2 to bytes (ndarray.tostring() is a deprecated alias of tobytes())
    image_string = cv2.imencode('.png', reimg)[1].tobytes()

    # upload
    s3_client.put_object(Bucket='thum-prj-output', Key=s3_path, Body=image_string)
    
...

...

4/02/2020

PDF to OpenCV as page by page using PyMuPDF library (python example code)

Just see the below example code 😊

pip install PyMuPDF
document : https://pymupdf.readthedocs.io/en/latest/

I think this is a better library than PyPDF2 🤔
..

import fitz

import numpy as np

import cv2

# Render every page of a PDF to PNG, saving each page twice: once from the
# raw PNG bytes and once after decoding those bytes into an OpenCV image.
fname = 'information-10-00248-v2'
doc = fitz.open(fname+'.pdf')

# The render matrix is identical for every page, so build it once.
zoom = 1
mat = fitz.Matrix(zoom, zoom)

# split pages
for i, page in enumerate(doc.pages()):
    print(i)
    pix = page.getPixmap(matrix = mat)
    imgData = pix.getImageData("png")

    # save image from byte (context manager closes the file even on error)
    with open('./save_by_byte_{}_{}.png'.format(fname, i), 'wb') as f:
        f.write(imgData)

    # save image from opencv
    nparr = np.frombuffer(imgData, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    print(img.shape)
    cv2.imwrite('./save_by_opencv_{}_{}.png'.format(fname, i),img)

MareArts Computer Vision Study.

Pages