Showing posts with label tokenizer.

7/04/2023

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000     # vocabulary size; 32,000 is commonly considered a good default
limit_alphabet = 6000  # limit on the number of initial (alphabet) tokens kept before merges
output_path = 'hugging_%d' % (vocab_size)
min_frequency = 5      # minimum number of occurrences for a token to be kept
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..


BertWordPiece Tokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path, strip_accents=False,
                                              lowercase=False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"  # "tokenizer on a Naver movie review sentence"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..


SentencePiece Tokenizer

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram'     # unigram or bpe
character_coverage = 1.0   # default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument % (input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..
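
Note that recent sentencepiece versions also accept keyword arguments instead of a single flag string, which avoids assembling the command line by hand. A minimal sketch with the same settings (the user_defined_symbols list is shortened here for brevity):

.

import sentencepiece as spm

# Keyword-argument form of the same training call (no flag string needed);
# assumes the 'sentencepiece' directory from the snippet above already exists
spm.SentencePieceTrainer.Train(
    input='./ratings.txt',
    model_prefix='sentencepiece/tokenizer_32000',
    vocab_size=32000,
    model_type='unigram',
    character_coverage=1.0,
    user_defined_symbols='[PAD],[UNK],[CLS],[SEP],[MASK]',  # shortened list
)

..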


SentencePiece Tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"  # "tokenizer on a Naver movie review sentence"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
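
To recover the original text, the SentencePieceProcessor can decode either pieces or ids; a quick round-trip check using the objects from the snippet above:

.

# Both decoders should reproduce the original sentence
print(sp.decode_pieces(tokens))  # 넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €
print(sp.decode_ids(ids))        # 넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €

..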


Thank you.

πŸ™‡πŸ»‍♂️


5/23/2023

Simple example code for creating a custom tokenizer.

In the sample code, the vocabulary is "0,1,2,3,4" and the max length is 20.


.

from typing import List, Union

class CustomTokenizer:
    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        if isinstance(vocab, str):
            with open(vocab, 'r') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f.readlines())}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab.get(self.pad_token)) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, self.pad_token) for id in ids]



vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..
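
For reference, running the code above should print output along these lines (the ids follow the line order of vocab.txt, and the <PAD> run continues up to the fixed length):

.

vocab:  {'<PAD>': 0, '<BOS>': 1, '<SEP>': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7}
tokenized_text:  ['<BOS>', '2', '2', '3', '4', '2', '<SEP>', '<PAD>', '<PAD>', ...]
token_ids:  [1, 5, 5, 6, 7, 5, 2, 0, 0, ...]
tokens:  ['<BOS>', '2', '2', '3', '4', '2', '<SEP>', '<PAD>', '<PAD>', ...]

..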


Thank you.

πŸ™‡πŸ»‍♂️


2/17/2023

Efficiently Converting LayoutLMv3 OCR Model Output Logits with Strides and Split Tokens

Refer to the example code below:



..

import torch
from PIL import Image
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor

# Load the pre-trained model and its processor
# (the processor runs OCR via pytesseract by default and returns
#  input_ids, bbox, attention_mask and pixel_values in one call)
model_name = "microsoft/layoutlmv3-base"
processor = LayoutLMv3Processor.from_pretrained(model_name)
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name)

# Load the image and preprocess it using the processor
image_path = 'example.png'
image = Image.open(image_path).convert("RGB")
inputs = processor(image, return_tensors="pt")

# Pass the preprocessed inputs through the model and obtain the output logits
outputs = model(**inputs)

# Get the list of input token ids and corresponding bounding boxes
input_tokens = inputs.input_ids[0]
input_boxes = inputs.bbox[0]

# Calculate the corresponding bounding boxes for the downsampled tokens.
# NOTE: the processor does not return per-token strides; if your preprocessing
# records them (e.g. from sliding-window chunking), plug them in here.
strides = [1] * len(input_tokens)  # placeholder so the example runs end to end
downsampled_boxes = []
for i in range(len(input_tokens)):
    token_box = input_boxes[i]
    stride = strides[i]
    downsampled_box = [int(coord) // stride for coord in token_box]
    downsampled_boxes.append(downsampled_box)

# Get the output logits corresponding to the valid (non-padding) input tokens
valid_tokens = inputs.attention_mask[0].nonzero(as_tuple=False).squeeze(-1)
valid_logits = outputs.logits[0, valid_tokens]

# Concatenate the output logits for all batches in the correct order
if len(outputs.logits.shape) == 3:
    batch_logits = torch.cat([batch_logits for batch_logits in outputs.logits], dim=0)
else:
    batch_logits = outputs.logits

# Convert the selected output logits to probabilities and assign each token
# the label of the class with the highest probability
word_labels = []
for i, (token_id, token_box) in enumerate(zip(input_tokens, downsampled_boxes)):
    token_logits = batch_logits[i]
    token_probs = torch.softmax(token_logits, dim=0)
    word_prob, word_label = token_probs.max(dim=0)
    word_labels.append(word_label.item())

..


To convert the output logits to a word-level format, you will need to perform several steps:

  1. Obtain the list of token ids and corresponding bounding boxes from the input that was passed to the LayoutLMv3 model.

  2. Use the strides parameter to obtain the corresponding bounding boxes for the downsampled tokens.

  3. Use the attention_mask tensor to identify which output tokens correspond to valid input tokens, and select the corresponding output logits.

  4. If the input sequence was split into multiple batches, concatenate the output logits for each batch in the correct order.

  5. Convert the selected output logits to probabilities using the softmax function.

  6. Assign each word the label corresponding to the class with the highest probability for its tokens (see the grouping sketch below).
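
As a concrete sketch of steps 5 and 6, the snippet below turns per-token logits into one label per word by grouping sub-tokens on their word index (for a fast tokenizer this index is available via encoding.word_ids()). The token_logits and word_ids values here are hypothetical stand-ins, not outputs of the code above.

..

import torch

# Hypothetical per-token logits (seq_len x num_labels) and word indices;
# with a fast tokenizer, word indices come from encoding.word_ids(0),
# where None marks special tokens such as [CLS]/[SEP] and padding.
token_logits = torch.randn(8, 3)
word_ids = [None, 0, 0, 1, 2, 2, 2, None]

# Step 5: convert logits to probabilities
token_probs = torch.softmax(token_logits, dim=-1)

# Step 6: collect the probabilities of all sub-tokens belonging to each word
per_word = {}
for token_idx, word_idx in enumerate(word_ids):
    if word_idx is None:  # skip special tokens and padding
        continue
    per_word.setdefault(word_idx, []).append(token_probs[token_idx])

# Average the sub-token probabilities per word and take the argmax class
word_labels = {w: torch.stack(p).mean(dim=0).argmax().item() for w, p in per_word.items()}
print(word_labels)  # e.g. {0: 1, 1: 0, 2: 2}

..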



Thank you.

πŸ™‡πŸ»‍♂️ 

www.marearts.com

2/07/2023

Tokenizer token grouping, token entity grouping

Refer to the example code below:


..

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]
sentence = ' '.join(words)

tokens = tokenizer(sentence, return_tensors="np", max_length=128, padding='max_length')

print('origin tokenizer result -------')
print(f'words({len(words)}), {words}')
print(f'sentence: {sentence}')
print(f'tokens ({ len(tokens["input_ids"][0]) }) : {tokens}')

print('grouping tokens -------')
word_tokens_list = {x : tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words}
print('word_tokens_list: ', word_tokens_list)

idx_for_words =[tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words]
print(f'idx_for_words ({len(idx_for_words)}): ',idx_for_words)

desired_output = []
idx = 0
for token in idx_for_words:
    tokenoutput = []
    for ids in token:
        tokenoutput.append(idx)
        idx += 1
    desired_output.append(tokenoutput)

print('tokens in grouped list')
print(desired_output)

..



output

..

origin tokenizer result -------
words(10), ['on', 'a', 'reported', 'basis', 'and', '10.4%', 'on', 'a', 'like-for-like', 'basis.']
sentence: on a reported basis and 10.4% on a like-for-like basis.
tokens (128)
input_ids: [[ 101 1113 170 2103 3142 1105 1275 119 125 110 1113 170 1176 118
1111 118 1176 3142 119 102 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0]]
token_type_ids: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
attention_mask: [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
grouping tokens -------
word_tokens_list: {'on': [1113], 'a': [170], 'reported': [2103], 'basis': [3142], 'and': [1105], '10.4%': [1275, 119, 125, 110], 'like-for-like': [1176, 118, 1111, 118, 1176], 'basis.': [3142, 119]}
idx_for_words (10): [[1113], [170], [2103], [3142], [1105], [1275, 119, 125, 110], [1113], [170], [1176, 118, 1111, 118, 1176], [3142, 119]]
tokens in grouped list
[[0], [1], [2], [3], [4], [5, 6, 7, 8], [9], [10], [11, 12, 13, 14, 15], [16, 17]]

..
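
For comparison, a fast tokenizer can produce the same word-to-token grouping directly through word_ids(), without re-encoding each word separately. A minimal sketch assuming the same bert-base-cased tokenizer; note that these token positions include the leading [CLS], so they are shifted by one relative to the grouped list above:

..

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]

# Encode the pre-split words so the tokenizer records which word each sub-token came from
encoding = tokenizer(words, is_split_into_words=True)
word_ids = encoding.word_ids()  # None marks [CLS]/[SEP]

grouped = {}
for token_idx, word_idx in enumerate(word_ids):
    if word_idx is None:  # skip special tokens
        continue
    grouped.setdefault(word_idx, []).append(token_idx)

print(list(grouped.values()))
# expected: [[1], [2], [3], [4], [5], [6, 7, 8, 9], [10], [11], [12, 13, 14, 15, 16], [17, 18]]

..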


Thank you.

πŸ™‡πŸ»‍♂️

www.marearts.com