Convert ckpt -> bin -> ONNX, then quantize. This example code is based on LayoutLMv3.

Please refer to the code below.


# https://github.com/huggingface/optimum
# python -m pip install optimum[onnxruntime]
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForTokenClassification
from layoutlmv3_model import layoutlmv3_ner_model
import pickle

def layoutlmv3_vvv3():
    """Convert a LayoutLMv3 NER checkpoint to ONNX and quantize it.

    All artifacts live under the hard-coded ``base_path`` checkpoint
    directory:

    1. Load the ``.ckpt`` checkpoint through ``layoutlmv3_ner_model``.
    2. Pickle the loaded model's ``cfg`` attribute to ``model_cfg.pkl``.
    3. Export the transformers model found in ``bin/`` to ONNX and save
       the exported model in ``onnx/``.
    4. Apply dynamic quantization and write the result to ``onnx_q/``.

    The function takes no arguments and returns nothing; its only effects
    are the files it writes.
    """
    # Kept function-local, as in the original snippet, so importing this
    # module does not pull in the quantization tooling.
    from optimum.onnxruntime import ORTQuantizer
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    base_path = 'layoutLM_research/tb_logs/model/M_microsoft-layoutlmv3-large_T_stride_V_v8/checkpoints/'
    ckpt_path = base_path + 'vvv0.99061.ckpt'
    bin_path = base_path + "bin"
    onnx_path = base_path + "onnx"
    quantizer_onnx_directory = base_path + "onnx_q"

    # Load the checkpoint model.
    base_model = layoutlmv3_ner_model.load_from_checkpoint(ckpt_path)

    # Save the model params so they can be restored without the checkpoint.
    with open(base_path + 'model_cfg.pkl', 'wb') as fout:
        pickle.dump(base_model.cfg, fout, protocol=2)

    # Save transformer model to bin.
    # NOTE(review): this step is missing from the snippet. The wrapped
    # transformers model has to be written to ``bin_path`` (presumably via
    # its ``save_pretrained`` method) before the export below can load it;
    # which attribute of ``base_model`` holds it depends on
    # ``layoutlmv3_ner_model`` -- confirm and add it here.

    # Load a model from transformers and export it to ONNX.
    # NOTE(review): newer optimum releases appear to spell this flag
    # ``export=True`` -- confirm against the installed version.
    ort_model = ORTModelForTokenClassification.from_pretrained(bin_path, from_transformers=True)

    # Save the exported (un-quantized) ONNX model; the original snippet
    # announced this step but never wrote anything to ``onnx_path``.
    ort_model.save_pretrained(onnx_path)

    # Define the quantization methodology (dynamic, per-tensor, arm64).
    qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
    quantizer = ORTQuantizer.from_pretrained(ort_model)

    # Apply dynamic quantization on the model.
    quantizer.quantize(save_dir=quantizer_onnx_directory, quantization_config=qconfig)

if __name__ == "__main__":
    # Run the conversion + quantization pipeline when executed as a script.
    layoutlmv3_vvv3()


Some of the specific code may not suit your case,

but the general concept should be the same.

Thank you.