10/09/2020

draw roc curve using python sklearn, Matplotlib

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn import metrics


# Ground-truth binary labels and the scores predicted for them.
gt = [1, 0, 1, 0, 1, 1]  # origin
pre = [0.9, 0.5, 0.8, 0.4, 0.5, 0.8]  # predict

# roc_curve sweeps the decision threshold; auc integrates the resulting curve.
fpr, tpr, thresholds = metrics.roc_curve(gt, pre)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Chance-level diagonal: two end points are enough for a straight line,
# so no numpy dependency is needed here.
ax.plot([0, 1], [0, 1], label='baseline', linestyle='--')
ax.set_title('Receiver Operating Characteristic Curve', fontsize=18)
ax.set_ylabel('TPR', fontsize=16)
ax.set_xlabel('FPR', fontsize=16)
ax.legend(fontsize=12)
plt.show()









10/08/2020

print gpu memory status in python

*install pynvml

https://pypi.org/project/pynvml/

pip install pynvml


*use below code in python code

from pynvml import (
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlInit,
    nvmlShutdown,
)

# NVML must be initialised before any query and released afterwards.
nvmlInit()
try:
    h = nvmlDeviceGetHandleByIndex(0)  # first GPU
    info = nvmlDeviceGetMemoryInfo(h)
    print(f'total    : {info.total}')
    print(f'free     : {info.free}')
    print(f'used     : {info.used}')
finally:
    nvmlShutdown()


10/06/2020

remove duplicated tuple item in list (python code)

 

print(tuple_list)
# Sort the members of every tuple so that (1, 0) and (0, 1) compare equal,
# then let a set drop the duplicates (the resulting order is arbitrary).
normalized = [tuple(sorted(pair)) for pair in tuple_list]
tuple_list = list(set(normalized))
print(tuple_list)


before:

[(0, 0), (0, 1), (0, 3), (1, 0), (1, 1), (1, 2), (2, 1), (2, 2), (2, 3), (3, 0), (3, 2), (3, 3)]

After:
[(0, 1), (1, 2), (0, 0), (3, 3), (2, 3), (2, 2), (0, 3), (1, 1)]



9/23/2020

Pytorch, Infinite DataLoader using iter & next

 


# create dataloader-iterator
data_iter = iter(data_loader)

# iterate over dataset
# alternatively you could use while(True)
for i in range(NUM_ITERS_YOU_WANT):
    try:
        data = next(data_iter)
    except StopIteration:
        # StopIteration is raised when the iterator is exhausted:
        # start a fresh pass over the loader and fetch from it instead
        data_iter = iter(data_loader)
        data = next(data_iter)

python argparse example


import argparse

import torch

# Parse an empty argument list so the namespace can be filled by hand
# (handy in notebooks, where sys.argv belongs to the kernel).
parser = argparse.ArgumentParser()
args = parser.parse_args([])
args.cuda = False
args.show_summary = False
args.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


print(args.cuda)


9/21/2020

find best (optimal) threshold using roc curve

def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve together with the chance-level diagonal.

    Args:
        fpr: False positive rates, one per threshold (x axis).
        tpr: True positive rates, one per threshold (y axis).
    """
    plt.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal from (0, 0) to (1, 1) for visual reference.
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

y_true = np.array([0, 0, 1, 1, 1])
y_scores = np.array([0.0, 0.09, .05, .75, 1])

fpr, tpr, thresholds = roc_curve(y_true, y_scores)
print(tpr)
print(fpr)
print(thresholds)
print(roc_auc_score(y_true, y_scores))
# Pick the threshold where the gap between TPR and FPR is largest.
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
plot_roc_curve(fpr, tpr)

How to interpret an AUC (area under the ROC curve) value:

0.9 ~ 1 : excellent
0.8 ~ 0.9: good
0.7 ~ 0.8 : normal
0.6 ~ 0.7 : poor
0.5 ~ 0.6 : fail


python measure processing time

 


from time import process_time

# Take a reading of the counter before the work begins
t1_start = process_time()

###
# processing goes here
###

# Take a second reading once the work is done; the difference is in seconds
t1_stop = process_time()
sec = t1_stop - t1_start


9/20/2020

split train test dataset

 


import random

from sklearn.model_selection import train_test_split

# Shuffle the list in place, then hold out 20% of it as the test set.
random.shuffle(pkl_list)
pkl_train, pkl_test = train_test_split(
    pkl_list,
    test_size=0.2,
)


show image in jupyter notebook

 

from matplotlib import pyplot as plt
import numpy as np
import cv2

img = cv2.imread('xxx.png')  # or image_data; OpenCV loads channels as BGR
# Reverse the channel axis (BGR -> RGB) because matplotlib expects RGB.
img2 = img[:, :, ::-1]
plt.imshow(img2)


fix hangul separating issue in mac

 

from unicodedata import normalize
def nfd2nfc(data):
    """Return *data* converted to Unicode Normalization Form C.

    NFC composes decomposed sequences (for example separate Hangul
    jamo) into single precomposed characters.

    Args:
        data: The string to normalize.

    Returns:
        The NFC-normalized string.
    """
    return normalize('NFC', data)


# Decomposed conjoining jamo U+1103 + U+1165 compose into one syllable.
nfd2nfc('\u1103\u1165')

-> 더 


python change file name, get file name, dir, ext, check file exist in source code using os package

 

get file name and ext

import os
# splitext splits off the last extension; element [0] is the path without it
os.path.splitext("/path/to/some/file.txt")[0]
# -> '/path/to/some/file'
# basename keeps only the final path component
base = os.path.basename('/root/dir/sub/file.ext')
# -> 'file.ext'
os.path.splitext(base)
# -> ('file', '.ext')
os.path.splitext(base)[0]
# -> 'file'

get dir

# dirname returns everything before the final path component
os.path.dirname("/path/to/some/file.txt")
# -> '/path/to/some'

change file name 

# os.rename(src, dst): raw strings keep the Windows backslashes literal
os.rename(r'C:\Users\Ron\Desktop\Test\Products.txt',r'C:\Users\Ron\Desktop\Test\Shipped Products.txt')


check file exist

# Evaluates to True when the path exists and is a regular file
os.path.isfile('./path_of_file')




9/18/2020

sparse tensor to csr_matrix

from scipy.sparse import csr_matrix
import numpy as np


# The number of entries in x fixes the side length of the square matrix.
x = val_data.x
dim = len(x)
print(dim)

# Row 0 of edge_index supplies the matrix row indices, row 1 the column indices.
edge_index = val_data.edge_index
print(edge_index) #sparse tensor
row, col = (edge_index[k].numpy() for k in (0, 1))
edge_num = len(row)

# Every listed edge contributes a value of 1.0.
data = np.ones(edge_num)
mtx = csr_matrix((data, (row, col)), shape=(dim, dim))

# Densify once and reuse the array for the value, type and shape print-out.
dense = mtx.toarray()
print(dense, type(dense), dense.shape)



Let's imagine the following:

val_data.x holds the node features, e.g. a 13x1000 matrix.

val_data.edge_index is the sparse edge index, stored as a torch tensor.


Now we want to convert it to a csr_matrix.

The code above is an example for this case.


The print out is like this:

tensor([[ 0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  4,
          4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,
          8,  8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
         11, 11, 12, 12, 12, 12],
        [ 1,  3, 10,  0,  2,  3, 10, 11,  1,  3, 11, 12,  0,  1,  2, 11, 12,  5,
          6,  8,  9, 11, 12,  4,  6,  7,  8,  9,  4,  5,  7,  9, 10,  5,  6,  8,
          4,  5,  7,  4,  5,  6, 10, 11,  0,  1,  6,  9, 11,  1,  2,  3,  4,  9,
         10, 12,  2,  3,  4, 11]])
[[0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1.]
 [0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.]
 [0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1.]
 [0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0.]] <class 'numpy.ndarray'> (13, 13)


Thank you
Enjoy Pytorch!


9/07/2020

image augmentation by python

pip install imgaug
pip install imagecorruptions

github : https://github.com/aleju/imgaug


import numpy as np
import imgaug as ia
import imgaug.augmenters as iaa
import cv2

def agument_rewrite(file_list):
    """Augment every image in *file_list* and overwrite it in place.

    Each augmenter in the pipeline is wrapped in ``iaa.Sometimes(0.1, ...)``
    and the augmenters run in random order. The augmented result is written
    back to the same path it was read from, so the original file is lost.

    Args:
        file_list: Sequence of image file paths readable by ``cv2.imread``.
    """
    # Wrap an augmenter so that it only fires with probability 0.1.
    sometimes = lambda aug: iaa.Sometimes(0.1, aug)
    seq = iaa.Sequential(
        [
            # each of the following augmenters is applied only occasionally
            sometimes(iaa.CropAndPad(percent=(-0.02, 0.02), pad_mode=ia.ALL, pad_cval=(0, 255))),
            sometimes(iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5)),  # add gaussian noise to images
            sometimes(iaa.Dropout(p=(0, 0.2))),
            sometimes(iaa.CoarseDropout(0.02, size_percent=0.15, per_channel=0.5)),
            sometimes(iaa.Solarize(0.5, threshold=(32, 128))),
            sometimes(iaa.Cartoon()),
            sometimes(iaa.MotionBlur(k=15)),
            sometimes(iaa.AllChannelsCLAHE()),
            sometimes(iaa.Emboss(alpha=(0.0, 1.0), strength=(0.5, 1.5))),
            sometimes(iaa.ElasticTransformation(alpha=(0, 5.0), sigma=0.25)),
            sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))),
            sometimes(iaa.imgcorruptlike.Snow(severity=2)),
            sometimes(iaa.Superpixels(p_replace=0.3, n_segments=500)),
            sometimes(iaa.Rain(speed=(0.1, 0.3))),
            sometimes(iaa.Snowflakes(flake_size=(0.1, 0.4), speed=(0.01, 0.05))),
            sometimes(iaa.Fog()),
        ],
        random_order=True
    )

    for i, v in enumerate(file_list):
        img = cv2.imread(v)
        if img is None:
            # cv2.imread returns None instead of raising on unreadable files;
            # skip them so they are not overwritten with garbage.
            print('{}/{} skip (cannot read) : {}'.format(i, len(file_list), v))
            continue
        images_aug = seq(images=[img])[0]  # done by the library
        cv2.imwrite(v, images_aug)
        print('{}/{} aug : {}'.format(i, len(file_list), v))

9/06/2020

Fix indentation in VS Code

 

  • On Windows Shift + Alt + F
  • On Mac Shift + Option + F
  • On Linux Ctrl + Shift + I


8/20/2020

8/18/2020

How to fix Python SSL CERTIFICATE_VERIFY_FAILED

 put this code on the top of code line:


import os
import ssl

# SECURITY: this turns off certificate verification for HTTPS contexts
# created through ssl's default factory. Use it only as a local workaround,
# never against untrusted networks. It is skipped when PYTHONHTTPSVERIFY is
# set, or when this Python has no ssl._create_unverified_context.
if (not os.environ.get('PYTHONHTTPSVERIFY', '')
        and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

Get list from dir and separate train and test (python function)


from sklearn.model_selection import train_test_split
import random
import os
import glob

def train_test_split_from_dir(origin_dir, test_size=0.2):
    """Split the ``*.jpg`` files found in *origin_dir* into train and test lists.

    Note: this changes the process working directory to *origin_dir*, and
    the returned names are relative to that directory.

    Args:
        origin_dir: Directory to scan for ``*.jpg`` files.
        test_size: Passed through to ``train_test_split`` (default 0.2).

    Returns:
        A ``(train_list, test_list)`` tuple of file names.
    """
    os.chdir(origin_dir)
    # get list
    data_list = glob.glob("*.jpg")
    random.shuffle(data_list)
    train_json_list, test_json_list = train_test_split(data_list, test_size=test_size)

    return train_json_list, test_json_list

7/29/2020

ROC & AUC example code in face detector model case



..

#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt

#model #1
# model #1: labels are 5 negatives, 10 positives, 5 negatives
y = np.array([0] * 5 + [1] * 10 + [0] * 5)
scores = np.array([
    0.64, 0.47, 0.46, 0.77, 0.72,
    0.9, 0.85, 0.7, 0.87, 0.92, 0.89, 0.93, 0.85, 0.81, 0.88,
    0.48, 0.1, 0.35, 0.68, 0.47,
])
fpr, tpr, thresholds = metrics.roc_curve(y, scores)
roc_auc = metrics.auc(fpr, tpr)

# plot: blue ROC curve with its AUC in the legend, red dashed diagonal
auc_label = 'AUC = %0.2f' % roc_auc
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label=auc_label)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

..


7/28/2020

Example model metrics using sklearn in face detector case


..

from sklearn.metrics import classification_report

#model 1
# Ground truth: 5 non-faces, 10 faces, 5 non-faces.
y_true = [0] * 5 + [1] * 10 + [0] * 5
# Predictions match the ground truth except for one missed face at index 7.
y_pred = list(y_true)
y_pred[7] = 0
target_names = ['Non Face', 'Face']
report = classification_report(y_true, y_pred, target_names=target_names, digits=3)
print(report)
..



..

#model 2
# Ground truth: 5 non-faces, 10 faces, 5 non-faces.
y_true = [0] * 5 + [1] * 10 + [0] * 5
# Predictions match the ground truth except for one false alarm at index 3.
y_pred = list(y_true)
y_pred[3] = 1
target_names = ['Non Face', 'Face']
report = classification_report(y_true, y_pred, target_names=target_names, digits=3)
print(report)
..


7/07/2020

extract year, month, day from file on Ubuntu, python example


...
import os
import time

# NOTE(review): on Linux getctime may report the last metadata change and
# not the creation time - confirm this is the date you want.
date_created_obj = time.localtime(os.path.getctime(full_path))
date_parts = (
    ('Year: {:4d}', date_created_obj.tm_year),   # Year: 2020
    ('Month: {:2d}', date_created_obj.tm_mon),   # Month: 2
    ('Day: {:2d}', date_created_obj.tm_mday),    # Day: 10
)
for template, value in date_parts:
    print(template.format(value))

...


7/06/2020

how to merge two csr_matrix, example python source code

let's see the code.

..
from scipy.sparse import csr_matrix
import numpy as np

# Both operands are 3x3 and store a 1 at each listed coordinate.
shape = (3, 3)

#first matrix
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.ones_like(row)
mtx = csr_matrix((data, (row, col)), shape=shape)

#second matrix
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 1, 2, 0, 1, 2])
data = np.ones_like(row)
mtx2 = csr_matrix((data, (row, col)), shape=shape)

#merge two matrix
# NOTE(review): merge_two_csr_mtx is not defined in this snippet; it has to
# be provided elsewhere before this line can run.
mtx3 = merge_two_csr_mtx(mtx, mtx2)

#check
for title, matrix in (('1st\n', mtx), ('2nd\n', mtx2), ('merge\n', mtx3)):
    print(title, matrix)
..

result
1st
   (0, 0) 1
  (0, 2) 1
  (1, 2) 1
  (2, 0) 1
  (2, 1) 1
  (2, 2) 1
2nd
   (0, 0) 1
  (0, 1) 1
  (1, 2) 1
  (2, 0) 1
  (2, 1) 1
  (2, 2) 1
merge
   (0, 0) 2.0
  (0, 1) 1.0
  (0, 2) 1.0
  (1, 2) 2.0
  (2, 0) 2.0
  (2, 1) 2.0
  (2, 2) 2.0

How to convert a scipy csr_matrix back into lists of row, col and data?

refer to code


..
Define matrix & check values
from scipy.sparse import csr_matrix
import numpy as np

# Parallel arrays: entry k of the matrix sits at (row[k], col[k]) with value data[k].
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 1, 2, 0, 1, 2])
data = np.ones_like(row)
mtx2 = csr_matrix((data, (row, col)), shape=(3, 3))

# First the sparse (coordinate, value) listing, then the dense array form.
print(mtx2) #matrix print out
dense = mtx2.toarray()
print(dense) #print out by array

>
(0, 0) 1
  (0, 1) 1
  (1, 2) 1
  (2, 0) 1
  (2, 1) 1
  (2, 2) 1
>
[[1 1 0]
 [0 0 1]
 [1 1 1]]
..


...
get back the row, col and data value from matrix
# The COO form exposes the row indices, column indices and values directly.
c = mtx2.tocoo()
for component in (c.row, c.col, c.data):
    print(component)

>
[0 0 1 2 2 2]
[0 1 2 0 1 2]
[1 1 1 1 1 1]
...

6/09/2020

sentence embedding, sentence to vector using bert

refer to source code

.
#pip install -U sentence-transformers
#https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer, LoggingHandler

# Load the sentence model (based on BERT) by its model name
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embed a list of sentences
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

# Print each sentence next to the shape and type of its embedding
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding.shape, type(embedding))
    print("")
.

result is like this:
Sentence: This framework generates embeddings for each input sentence
Embedding: (768,) <class 'numpy.ndarray'>

Sentence: Sentences are passed as a list of string.
Embedding: (768,) <class 'numpy.ndarray'>

Sentence: The quick brown fox jumps over the lazy dog.
Embedding: (768,) <class 'numpy.ndarray'>

5/25/2020

install poppler in ubuntu

Run these commands:

sudo apt-get update -y
sudo apt-get install -y poppler-utils

😁

5/19/2020

Ways to sort list of dictionaries by values in Python – Using lambda function


.
#example list
dict_list = [
    {"idx": 1, "value1": 32.44, "value2": 123.2},
    {"idx": 2, "value1": 32.414, "value2": 133.2},
    {"idx": 3, "value1": 32.244, "value2": 113.2},
]


def _value1(entry):
    """Sort key: the 'value1' field of one dictionary."""
    return entry['value1']


#ascending order
sorted_dict_list = sorted(dict_list, key=_value1)
#descending order
r_sorted_dict_list = sorted(dict_list, key=_value1, reverse=True)

#show result
print(sorted_dict_list)
# [{'idx': 3, 'value1': 32.244, 'value2': 113.2}, {'idx': 2, 'value1': 32.414, 'value2': 133.2}, {'idx': 1, 'value1': 32.44, 'value2': 123.2}]

print(r_sorted_dict_list)
# [{'idx': 1, 'value1': 32.44, 'value2': 123.2}, {'idx': 2, 'value1': 32.414, 'value2': 133.2}, {'idx': 3, 'value1': 32.244, 'value2': 113.2}]
.


5/15/2020

multi-thread example python source code

The code starts 10 worker threads that each run single_function.
If you look at the pid values in the result, you can see that the threads finish in whatever order they complete, not in the order they were submitted.

..
import queue
from concurrent.futures import ThreadPoolExecutor


#function for thread
def single_function(input, pid, out_queue):
    """Count ``input ** 3`` loop iterations and report the total on a queue.

    The triple loop is deliberate busy-work that keeps the thread occupied.

    Args:
        input: Loop bound for each of the three nested loops.
        pid: Identifier echoed back in the result so callers can tell
            which task produced it.
        out_queue: Queue that receives ``{'index': pid, 'result': total}``.
    """
    total = 0
    for i in range(0, input):
        for j in range(0, input):
            for k in range(0, input):
                total = total + 1

    out_queue.put({'index': pid, 'result': total})


if __name__ == "__main__":
    #run thread
    my_queue = queue.Queue()
    with ThreadPoolExecutor(max_workers=10) as executor:
        for pid in range(0, 10):
            executor.submit(single_function, 100, pid, my_queue)

    # Leaving the "with" block waits for every submitted task, so the queue
    # now holds all ten results.
    #get result of each thread
    while not my_queue.empty():
        print(my_queue.get())

    #finish all thread

#finish all thread
..

result

{'index': 1, 'result': 1000000}
{'index': 3, 'result': 1000000}
{'index': 2, 'result': 1000000}
{'index': 0, 'result': 1000000}
{'index': 5, 'result': 1000000}
{'index': 4, 'result': 1000000}
{'index': 8, 'result': 1000000}
{'index': 6, 'result': 1000000}
{'index': 9, 'result': 1000000}
{'index': 7, 'result': 1000000}

5/02/2020

get image rect list from pdf

extract all image rect list from pdf using pymupdf
look at the sample code

..

#pip install PyMuPDF
#document : https://pymupdf.readthedocs.io/en/latest/

#pip install opencv-python
#github : https://github.com/skvark/opencv-python

import fitz

img_bbox = []
doc1 = fitz.open('test.pdf')
page1 = doc1[0]  # first page

# NOTE(review): newer PyMuPDF releases spell this get_text(); confirm
# against the installed version.
d = page1.getText("dict")
blocks = d["blocks"]
# Keep only the blocks whose type is 1; these are treated as image blocks.
imgblocks = [b for b in blocks if b["type"] == 1]
for v in imgblocks:
    x1, y1, x2, y2 = v['bbox']
    #print(x1, y1, x2, y2)
    # Truncate the coordinates to integers.
    img_bbox.append({'left': int(x1), 'top': int(y1), 'right': int(x2), 'bottom': int(y2)})
..

4/23/2020

remove all image from pdf file, python source code

input
output


PyMuPDF is needed
pip install PyMuPDF
..

def remove_img_on_pdf(idoc, page):
    """Remove the image names of one page from its content streams.

    Every entry of the page's image list carries a name at index 7. Each
    occurrence of such a name is deleted from the page's content streams,
    and the modified stream is written back to the document.

    Args:
        idoc: An opened document object (modified in place).
        page: Zero-based page number.

    Returns:
        The same *idoc* object, for convenience.
    """
    #image list
    img_list = idoc.getPageImageList(page)
    con_list = idoc[page]._getContents()

    for i in con_list:
        c = idoc._getXrefStream(i)  # read the stream source
        if c is None:
            continue
        for v in img_list:
            arr = bytes(v[7], 'utf-8')
            if arr in c:  # the stream references this image
                idoc._updateStream(i, c.replace(arr, b""))
                # Re-read so the next image works on the updated stream.
                c = idoc._getXrefStream(i)
    return idoc


# Open the PDF, strip the images of its first page (index 0), save a copy.
doc = fitz.open('example.PDF')
first_page = 0
rdoc = remove_img_on_pdf(doc, first_page)
rdoc.save('no_img_example.PDF')
..

reference : https://github.com/pymupdf/PyMuPDF/issues/338



4/16/2020

Python OpenCV Image to byte string for json transfer

code :
import cv2
import base64
import json
import numpy as np

######################################################
#read image
img = cv2.imread('./code_backup/test_img.jpg')
if img is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError('cannot read ./code_backup/test_img.jpg')
#cv2 to string: JPEG-encode in memory, then base64 so it fits in JSON text
image_string = cv2.imencode('.jpg', img)[1]
image_string = base64.b64encode(image_string).decode()
#make string image dict (named payload to avoid shadowing the dict builtin)
payload = {'img': image_string}
#save dict to json file
with open('./code_backup/cv2string.json', 'w') as fp:
    json.dump(payload, fp, indent=5)
######################################################


######################################################
#read json (the with-block closes the file handle)
with open('./code_backup/cv2string.json', 'r') as fp:
    response = json.load(fp)
#get image string
string = response['img']
#convert string to image: base64 -> raw JPEG bytes -> uint8 buffer -> decoded image
jpg_original = base64.b64decode(string)
jpg_as_np = np.frombuffer(jpg_original, dtype=np.uint8)
img = cv2.imdecode(jpg_as_np, flags=cv2.IMREAD_COLOR)
#show image
cv2.imshow('show image', img)
cv2.waitKey(0)
######################################################
..

this is input image

this is summarised json file
{ "img": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAFoAeADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD2BP8Ag34/4JIE4P7I4/8AC617/wCTqsJ/wb4/8EiTx/wyR/5fWv8A/wAnV9nPH2x+FSwx14/t6vc+i+r4f+Q+Mk/4N7P+CQ+P+TRx/wC ........."
}

4/03/2020

Example python code for : Download s3 object as opencv image in memory and upload too

Just see the code
It's not difficult.

...

...
import cv2
import numpy as np
...

def lambda_handler(event, context):
    """Resize the image named in an S3 event to 100x100 and upload it as PNG.

    Only the first record of the event is processed. The thumbnail is
    written to the 'thum-prj-output' bucket under the same object key.

    Args:
        event: S3 notification event; reads
            ``event['Records'][0]['s3']`` for the bucket name and object key.
        context: Lambda context object (unused).

    Raises:
        ValueError: If the downloaded object cannot be decoded as an image.
    """
    from urllib.parse import unquote_plus

    s3_info = event['Records'][0]['s3']
    bucket_name = s3_info['bucket']['name']
    # Object keys arrive URL-encoded in S3 event notifications.
    s3_path = unquote_plus(s3_info['object']['key'])
    #download object
    obj = s3_client.get_object(Bucket=bucket_name, Key=s3_path)
    #obj to cv2
    nparr = np.frombuffer(obj['Body'].read(), np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imdecode returns None when the bytes are not a decodable image.
        raise ValueError('cannot decode image: {}'.format(s3_path))
    #simple image processing
    reimg = cv2.resize(img, (100, 100))
    #cv2 to bytes (tobytes replaces the deprecated ndarray.tostring)
    image_string = cv2.imencode('.png', reimg)[1].tobytes()
    #upload
    s3_client.put_object(Bucket='thum-prj-output', Key=s3_path, Body=image_string)
...

...

4/02/2020

PDF to OpenCV as page by page using PyMuPDF library (python example code)

Just see the below example code 😊

pip install PyMuPDF
document : https://pymupdf.readthedocs.io/en/latest/

I think this is better library than pypdf2 🤔
..

import fitz
import numpy as np
import cv2
fname = 'information-10-00248-v2'
doc = fitz.open(fname+'.pdf')

# The zoom matrix is the same for every page, so build it once.
zoom = 1
mat = fitz.Matrix(zoom, zoom)

#split pages
for i, page in enumerate(doc.pages()):
    print(i)
    # NOTE(review): newer PyMuPDF releases use snake_case names for these
    # two calls; confirm against the installed version.
    pix = page.getPixmap(matrix = mat)
    imgData = pix.getImageData("png")

    #save image from byte
    with open('./save_by_byte_{}_{}.png'.format(fname, i), 'wb') as f:
        f.write(imgData)

    #save image from opencv
    nparr = np.frombuffer(imgData, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    print(img.shape)
    cv2.imwrite('./save_by_opencv_{}_{}.png'.format(fname, i),img)

..