7/04/2023

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000               # vocabulary size; 32,000 is commonly reported to work well
limit_alphabet = 6000            # limit on the number of initial (alphabet) tokens kept before merges
output_path = 'hugging_%d' % (vocab_size)  # output name (not used below)
min_frequency = 5                # minimum number of times a word must occur to be included
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..
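
To sanity-check the training step, the short sketch below counts the learned vocabulary and peeks at the first entries. It assumes save_model() above wrote './vocab.txt' into hf_model_path; adjust the path if a prefix was used.

.

# Minimal sanity check for the trained WordPiece vocab.
# './vocab.txt' is assumed to be the file written by save_model() above.
with open('./vocab.txt', encoding='utf-8') as f:
    vocab = [line.rstrip('\n') for line in f]

print('vocab size  :', len(vocab))   # should be close to vocab_size (32000)
print('first tokens:', vocab[:10])   # special tokens such as [PAD], [CLS] come first

..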


BertWordPieceTokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path, strip_accents=False,
                                              lowercase=False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"  # sample Korean sentence: "tokenizer with a Naver movie review sentence"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..
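
For debugging it can also help to map the ids back into text. The sketch below continues from the test above and relies only on the standard decode() method of BertTokenizerFast.

.

# Round-trip check: decode the ids produced above back into text.
ids = tokenized_input_for_pytorch['input_ids'].tolist()[0]
print(tokenizer.decode(ids))                             # includes [CLS] / [SEP]
print(tokenizer.decode(ids, skip_special_tokens=True))   # plain sentence only

..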


SentencePiece Tokenizer training code

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram'      # 'unigram' or 'bpe'
character_coverage = 1.0    # default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument % (input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..
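
Training with the settings above should write tokenizer_32000.model and tokenizer_32000.vocab under ./sentencepiece. The minimal sketch below (assuming the default tab-separated .vocab format) prints the first entries to confirm that the user-defined symbols sit near the top of the id space, right after the built-in <unk>, <s>, </s>.

.

# Peek at the generated vocabulary file (one 'piece<TAB>score' per line).
# The path is built from the variables above; adjust it if you changed them.
with open(sp_model_path + '.vocab', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 10:
            break
        piece, score = line.rstrip('\n').split('\t')
        print(i, piece, score)

..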


SentencePiece Tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"  # same Korean sample sentence as above
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
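
As a final round-trip check, the same SentencePieceProcessor can decode the pieces or ids back into the original sentence; the sketch below continues from the test above and uses the standard decode_pieces / decode_ids calls.

.

# Round-trip check: both representations should decode to the original text.
print(sp.decode_pieces(tokens))   # from the piece strings
print(sp.decode_ids(ids))         # from the integer ids

..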


Thank you.

πŸ™‡πŸ»‍♂️

