BertWordPieceTokenizer training code
.
import os
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)
corpus_file = ['./ratings.txt'] # data path
vocab_size = 32000 # vocab size; 32,000 is commonly said to work well
limit_alphabet = 6000 # limit on the number of initial tokens (the alphabet) kept before merges are performed
output_path = 'hugging_%d'%(vocab_size)
min_frequency = 5 # minimum occurrence frequency of a word
hf_model_path = './'
tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)
tokenizer.save_model(hf_model_path)
..
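save_model writes a plain vocab.txt into hf_model_path, so a quick sanity check is simply to read that file back. A minimal sketch, assuming the default vocab.txt file name:
.
# Inspect the vocab.txt that save_model wrote (assumes the default file name).
vocab_file = os.path.join(hf_model_path, 'vocab.txt')
with open(vocab_file, encoding='utf-8') as f:
    vocab = [line.rstrip('\n') for line in f]
print(len(vocab))  # at most vocab_size (32,000)
print(vocab[:5])   # special tokens such as [PAD], [UNK], [CLS], [SEP], [MASK] come first
..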
BertWordPieceTokenizer test
.
from transformers import BertTokenizerFast
hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path, strip_accents=False,
                                              do_lower_case=False)
text = "네이버 영화 평가 문장으로 토크나이저"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')
print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))
# Tokens (str)      : ['[CLS]', '네이버', '영화', '평가', '문', '##장', '##으로', '토', '##크', '##나이', '##저', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
..
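The ids can also be mapped back to text with the fast tokenizer's decode method. A minimal sketch continuing from the test above:
.
# Decode the ids back to a string (continues from the test above).
ids = tokenized_input_for_pytorch['input_ids'].tolist()[0]
print(tokenizer.decode(ids, skip_special_tokens=True))
# Should roughly reproduce the original sentence once the ## pieces are merged back.
..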
SentencePiece Tokenizer training code
.
import sentencepiece as spm
import os
input_file = './ratings.txt'
vocab_size = 32000
sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram' # choices: unigram, bpe
character_coverage = 1.0 #default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'
input_argument = ('--input=%s --model_prefix=%s --vocab_size=%s '
                  '--user_defined_symbols=%s --model_type=%s --character_coverage=%s')
cmd = input_argument % (input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)
spm.SentencePieceTrainer.Train(cmd)
..
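Recent sentencepiece releases also accept keyword arguments instead of a command-line style string. An equivalent call would look roughly like this (a sketch, assuming a sentencepiece version that supports the keyword API):
.
# Same training run expressed with keyword arguments
# (assumes a sentencepiece release that supports this API).
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=sp_model_path,
    vocab_size=vocab_size,
    user_defined_symbols=user_defined_symbols.split(','),
    model_type=model_type,
    character_coverage=character_coverage,
)
..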
SentencePiece Tokenizer test
.
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))
text = "네이버 영화 평가 문장으로 토크나이저"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)
print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))
# Tokens (str) : ['▁네이버', '▁영화', '▁평가', '▁문', '장', '으로', '▁', '토크', '나이', '저']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]
..
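Pieces and ids can be decoded back to the surface string with the same processor. A minimal sketch:
.
# Decode pieces / ids back to the original text.
print(sp.decode_pieces(tokens))  # joins the pieces and turns '▁' back into spaces
print(sp.decode_ids(ids))        # same result, starting from the id sequence
..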
Thank you.