Showing posts with label tokenizer.

7/04/2023

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000     # vocabulary size; 32,000 is commonly considered a good default
limit_alphabet = 6000  # limit on the number of initial (alphabet) tokens kept before merges
output_path = 'hugging_%d' % (vocab_size)
min_frequency = 5      # minimum number of occurrences for a token to be kept
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..


BertWordPiece Tokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path, strip_accents=False,
                                              lowercase=False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"  # "tokenizer on a Naver movie review sentence"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..


SentencePiece Tokenizer

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram'     # unigram or bpe
character_coverage = 1.0   # default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument % (input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..
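
Note that recent sentencepiece versions also accept keyword arguments instead of a single flag string, which avoids assembling the command line by hand. A minimal sketch with the same settings (the user_defined_symbols list is shortened here for brevity):

.

import sentencepiece as spm

# Keyword-argument form of the same training call (no flag string needed);
# assumes the 'sentencepiece' directory from the snippet above already exists
spm.SentencePieceTrainer.Train(
    input='./ratings.txt',
    model_prefix='sentencepiece/tokenizer_32000',
    vocab_size=32000,
    model_type='unigram',
    character_coverage=1.0,
    user_defined_symbols='[PAD],[UNK],[CLS],[SEP],[MASK]',  # shortened list
)

..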


SentencePiece Tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"  # "tokenizer on a Naver movie review sentence"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
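
To recover the original text, the SentencePieceProcessor can decode either pieces or ids; a quick round-trip check using the objects from the snippet above:

.

# Both decoders should reproduce the original sentence
print(sp.decode_pieces(tokens))  # 넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €
print(sp.decode_ids(ids))        # 넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €

..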


Thank you.

πŸ™‡πŸ»‍♂️


5/23/2023

Simple example code for creating a custom tokenizer.

In the sample code, the vocabulary is "0,1,2,3,4" and the max length is 20.


.

from typing import List, Union

class CustomTokenizer:
    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        if isinstance(vocab, str):
            with open(vocab, 'r') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f.readlines())}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab.get(self.pad_token)) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, self.pad_token) for id in ids]



vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..
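
For reference, running the code above should print output along these lines (the ids follow the line order of vocab.txt, and the <PAD> run continues up to the fixed length):

.

vocab:  {'<PAD>': 0, '<BOS>': 1, '<SEP>': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7}
tokenized_text:  ['<BOS>', '2', '2', '3', '4', '2', '<SEP>', '<PAD>', '<PAD>', ...]
token_ids:  [1, 5, 5, 6, 7, 5, 2, 0, 0, ...]
tokens:  ['<BOS>', '2', '2', '3', '4', '2', '<SEP>', '<PAD>', '<PAD>', ...]

..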


Thank you.

πŸ™‡πŸ»‍♂️


2/17/2023

Efficiently Converting LayoutLMv3 OCR Model Output Logits with Strides and Split Tokens

Refer to the example code below:



..

import torch
from PIL import Image
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor

# Load the pre-trained model and its processor
# (the processor runs OCR via pytesseract by default and returns
#  input_ids, bbox, attention_mask and pixel_values in one call)
model_name = "microsoft/layoutlmv3-base"
processor = LayoutLMv3Processor.from_pretrained(model_name)
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name)

# Load the image and preprocess it using the processor
image_path = 'example.png'
image = Image.open(image_path).convert("RGB")
inputs = processor(image, return_tensors="pt")

# Pass the preprocessed inputs through the model and obtain the output logits
outputs = model(**inputs)

# Get the list of input token ids and corresponding bounding boxes
input_tokens = inputs.input_ids[0]
input_boxes = inputs.bbox[0]

# Calculate the corresponding bounding boxes for the downsampled tokens.
# NOTE: the processor does not return per-token strides; if your preprocessing
# records them (e.g. from sliding-window chunking), plug them in here.
strides = [1] * len(input_tokens)  # placeholder so the example runs end to end
downsampled_boxes = []
for i in range(len(input_tokens)):
    token_box = input_boxes[i]
    stride = strides[i]
    downsampled_box = [int(coord) // stride for coord in token_box]
    downsampled_boxes.append(downsampled_box)

# Get the output logits corresponding to the valid (non-padding) input tokens
valid_tokens = inputs.attention_mask[0].nonzero(as_tuple=False).squeeze(-1)
valid_logits = outputs.logits[0, valid_tokens]

# Concatenate the output logits for all batches in the correct order
if len(outputs.logits.shape) == 3:
    batch_logits = torch.cat([batch_logits for batch_logits in outputs.logits], dim=0)
else:
    batch_logits = outputs.logits

# Convert the selected output logits to probabilities and assign each token
# the label of the class with the highest probability
word_labels = []
for i, (token_id, token_box) in enumerate(zip(input_tokens, downsampled_boxes)):
    token_logits = batch_logits[i]
    token_probs = torch.softmax(token_logits, dim=0)
    word_prob, word_label = token_probs.max(dim=0)
    word_labels.append(word_label.item())

..


To convert the output logits to a word-level format, you will need to perform several steps:

  1. Obtain the list of token ids and corresponding bounding boxes from the input that was passed to the LayoutLMv3 model.

  2. Use the strides parameter to obtain the corresponding bounding boxes for the downsampled tokens.

  3. Use the attention_mask tensor to identify which output tokens correspond to valid input tokens, and select the corresponding output logits.

  4. If the input sequence was split into multiple batches, concatenate the output logits for each batch in the correct order.

  5. Convert the selected output logits to probabilities using the softmax function.

  6. Assign each word the label corresponding to the class with the highest probability for its tokens (see the grouping sketch below).
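
As a concrete sketch of steps 5 and 6, the snippet below turns per-token logits into one label per word by grouping sub-tokens on their word index (for a fast tokenizer this index is available via encoding.word_ids()). The token_logits and word_ids values here are hypothetical stand-ins, not outputs of the code above.

..

import torch

# Hypothetical per-token logits (seq_len x num_labels) and word indices;
# with a fast tokenizer, word indices come from encoding.word_ids(0),
# where None marks special tokens such as [CLS]/[SEP] and padding.
token_logits = torch.randn(8, 3)
word_ids = [None, 0, 0, 1, 2, 2, 2, None]

# Step 5: convert logits to probabilities
token_probs = torch.softmax(token_logits, dim=-1)

# Step 6: collect the probabilities of all sub-tokens belonging to each word
per_word = {}
for token_idx, word_idx in enumerate(word_ids):
    if word_idx is None:  # skip special tokens and padding
        continue
    per_word.setdefault(word_idx, []).append(token_probs[token_idx])

# Average the sub-token probabilities per word and take the argmax class
word_labels = {w: torch.stack(p).mean(dim=0).argmax().item() for w, p in per_word.items()}
print(word_labels)  # e.g. {0: 1, 1: 0, 2: 2}

..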



Thank you.

πŸ™‡πŸ»‍♂️ 

www.marearts.com

2/07/2023

Tokenizer token grouping, token entity grouping

Refer to the example code below:


..

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]
sentence = ' '.join(words)

tokens = tokenizer(sentence, return_tensors="np", max_length=128, padding='max_length')

print('origin tokenizer result -------')
print(f'words({len(words)}), {words}')
print(f'sentence: {sentence}')
print(f'tokens ({ len(tokens["input_ids"][0]) }) : {tokens}')

print('grouping tokens -------')
word_tokens_list = {x : tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words}
print('word_tokens_list: ', word_tokens_list)

idx_for_words =[tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words]
print(f'idx_for_words ({len(idx_for_words)}): ',idx_for_words)

desired_output = []
idx = 0
for token in idx_for_words:
    tokenoutput = []
    for ids in token:
        tokenoutput.append(idx)
        idx += 1
    desired_output.append(tokenoutput)

print('tokens in grouped list')
print(desired_output)

..



output

..

origin tokenizer result -------
words(10), ['on', 'a', 'reported', 'basis', 'and', '10.4%', 'on', 'a', 'like-for-like', 'basis.']
sentence: on a reported basis and 10.4% on a like-for-like basis.
tokens (128)
input_ids: [[ 101 1113 170 2103 3142 1105 1275 119 125 110 1113 170 1176 118
1111 118 1176 3142 119 102 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0]]
token_type_ids: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
attention_mask: [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
grouping tokens -------
word_tokens_list: {'on': [1113], 'a': [170], 'reported': [2103], 'basis': [3142], 'and': [1105], '10.4%': [1275, 119, 125, 110], 'like-for-like': [1176, 118, 1111, 118, 1176], 'basis.': [3142, 119]}
idx_for_words (10): [[1113], [170], [2103], [3142], [1105], [1275, 119, 125, 110], [1113], [170], [1176, 118, 1111, 118, 1176], [3142, 119]]
tokens in grouped list
[[0], [1], [2], [3], [4], [5, 6, 7, 8], [9], [10], [11, 12, 13, 14, 15], [16, 17]]

..
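
For comparison, a fast tokenizer can produce the same word-to-token grouping directly through word_ids(), without re-encoding each word separately. A minimal sketch assuming the same bert-base-cased tokenizer; note that these token positions include the leading [CLS], so they are shifted by one relative to the grouped list above:

..

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]

# Encode the pre-split words so the tokenizer records which word each sub-token came from
encoding = tokenizer(words, is_split_into_words=True)
word_ids = encoding.word_ids()  # None marks [CLS]/[SEP]

grouped = {}
for token_idx, word_idx in enumerate(word_ids):
    if word_idx is None:  # skip special tokens
        continue
    grouped.setdefault(word_idx, []).append(token_idx)

print(list(grouped.values()))
# expected: [[1], [2], [3], [4], [5], [6, 7, 8, 9], [10], [11], [12, 13, 14, 15, 16], [17, 18]]

..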


Thank you.

πŸ™‡πŸ»‍♂️

www.marearts.com