5/23/2023

Simple sample code for creating a custom tokenizer.

In the sample code, the vocabulary is the characters "0", "1", "2", "3", "4" (plus the special tokens <PAD>, <BOS>, and <SEP>), and the max length is 20.


.

from typing import List, Union

class CustomTokenizer:
    """Character-level tokenizer backed by a fixed vocabulary.

    Each vocabulary entry is assigned its position (0-based) as its id.
    Text is split into single characters; characters that are not in the
    vocabulary are dropped.
    """

    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        """Build the token <-> id tables.

        Args:
            vocab: Either a path to a text file with one token per line, or
                a list of tokens. The id of a token is its line/list index.
            pad_token: Token used to pad sequences and as the fallback for
                unknown tokens/ids.
            cls_token: Token placed at the start of every tokenized sequence.
            sep_token: Token placed after the last content token.
            max_len: Maximum number of content tokens kept per sequence
                (special tokens are not counted).

        Raises:
            ValueError: If ``vocab`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(vocab, str):
            # Explicit encoding so the vocab file reads the same on every
            # platform instead of depending on the locale default.
            with open(vocab, 'r', encoding='utf-8') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f)}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        # Reverse lookup table: id -> token.
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        """Split ``text`` into characters and wrap them with special tokens.

        Characters missing from the vocabulary are silently skipped, and at
        most ``max_len`` characters are kept.

        Returns:
            A list of exactly ``max_len + 2`` tokens:
            ``[cls_token] + content + [sep_token] + padding``.
        """
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to ids.

        Unknown tokens map to the id of ``pad_token`` (``None`` if the pad
        token itself is not in the vocabulary).
        """
        pad_id = self.vocab.get(self.pad_token)
        return [self.vocab.get(token, pad_id) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        """Map ids back to tokens; unknown ids map to ``pad_token``."""
        return [self.inv_vocab.get(token_id, self.pad_token) for token_id in ids]



# Write the sample vocabulary to disk, one token per line; the line index
# becomes the token id when the tokenizer loads the file.
vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w', encoding='utf-8') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..


Thank you.

🙇🏻‍♂️


No comments:

Post a Comment