
7/13/2023

Beam search function for image-to-text or NLP inference.

Refer to the code first.

.

# This beam search only deals with batch size 1.
# (Method of a larger class; assumes `import torch` and `import torch.nn.functional as F`,
#  and that self.forward_pass / self.model / self.cfg are defined elsewhere in the class.)
def beam_search(self, pixel_value, max_length):
    beam_size = self.cfg.num_beams
    alpha = self.cfg.beam_alpha        # Length normalization coefficient
    temperature = self.cfg.beam_temp   # Temperature for softmax

    # Initialize input ids with the decoder start (BOS) token id
    first_sequence = torch.full((pixel_value.shape[0], 1), self.model.config.decoder_start_token_id).to(pixel_value.device)
    # ic(first_sequence)  # tensor([[1]])

    # Predict the second token id
    outputs = self.forward_pass(pixel_value, first_sequence)
    # ic(outputs.keys())  # dict_keys(['logits', 'loss'])
    # We only need the logits corresponding to the last prediction
    next_token_logits = outputs['logits'][:, -1, :]
    # ic(outputs['logits'].shape)            # [1, 1, 13] batch, seq, vocab_size
    # ic(outputs['logits'][:, -1, :].shape)  # [1, 13] batch, vocab_size

    # Apply temperature (optional)
    # ic(next_token_logits)
    # [-5.0641, 32.7805, -2.6743, -4.6459,  0.8130, -1.3443, -1.2016, -4.0770,
    #  -3.5401,  0.2425, -5.3685, -1.8074, -5.2606]
    # next_token_logits /= temperature
    # ic(next_token_logits)
    # [-7.2344, 46.8292, -3.8204, -6.6370,  1.1614, -1.9205, -1.7166, -5.8243,
    #  -5.0573,  0.3464, -7.6693, -2.5820, -7.5152]

    # Select the top-k tokens
    next_token_probs = F.softmax(next_token_logits, dim=-1)
    top_k_probs, top_k_ids = torch.topk(next_token_probs, beam_size)
    # ic(F.softmax(next_token_logits, dim=-1))
    # tensor([[3.3148e-24, 1.0000e+00, 1.0072e-22, 6.0241e-24, 1.4680e-20, 6.7340e-22,
    #          8.2570e-22, 1.3579e-23, 2.9239e-23, 6.4976e-21, 2.1458e-24, 3.4751e-22,
    #          2.5034e-24]])
    # ic(top_k_probs, top_k_ids)
    # top_k_probs: tensor([[1.]], grad_fn=<TopkBackward0>)
    # top_k_ids:   tensor([[1]])

    # Prepare the next sequences: each top-k token is appended to first_sequence
    # ic(first_sequence.shape)  # [1, 1]
    next_sequences = first_sequence.repeat_interleave(beam_size, dim=0)
    # ic(next_sequences.shape)  # [10, 1] 10 is beam size, 1 is seq length
    next_sequences = torch.cat([next_sequences, top_k_ids.view(-1, 1)], dim=-1)
    # ic(next_sequences.shape)  # [10, 2] 10 is beam size, 2 is seq length
    # ic(next_sequences)

    # Also prepare a tensor to hold the cumulative score of each sequence,
    # i.e. the sum of the log probabilities of each token in the sequence
    sequence_scores = torch.log(top_k_probs).view(-1)  # / (1 + 1) ** alpha
    # ic(sequence_scores)  # [ 0.0000, -15.9837]

    # We'll need to repeat the pixel values for each sequence in each beam
    pixel_value = pixel_value.repeat_interleave(beam_size, dim=0)
    # ic(pixel_value.shape)  # [10, 3, 224, 224], 10 is beam size, 3 is channel, 224 is image size

    for idx in range(max_length - 1):  # We already generated one token
        # ic(idx, '--------------------')
        outputs = self.forward_pass(pixel_value, next_sequences)
        next_token_logits = outputs['logits'][:, -1, :]
        # ic(outputs['logits'].shape, outputs['logits'])  # [2, 2, 13], batch, seq, vocab_size
        # ic(next_token_logits.shape, next_token_logits)

        # Apply temperature (optional)
        # next_token_logits /= temperature

        # Convert logits to probabilities and calculate new scores
        next_token_probs = F.softmax(next_token_logits, dim=-1)
        # ic(next_token_probs.shape, next_token_probs)    # [2, 13], batch, vocab_size
        next_token_scores = torch.log(next_token_probs)
        # ic(next_token_scores.shape, next_token_scores)  # [2, 13], batch, vocab_size

        new_scores = sequence_scores.unsqueeze(1) + next_token_scores
        # ic(sequence_scores.unsqueeze(1))
        # ic(new_scores.shape, new_scores)  # [2, 13], batch, vocab_size

        # Select the top-k sequences over the flattened (beam x vocab) scores
        # ic(new_scores.view(-1), new_scores.view(-1).shape)
        top_k_scores, top_k_indices = torch.topk(new_scores.view(-1), beam_size)
        # ic(top_k_scores, top_k_indices)

        # Get the beam and token that each of the top-k sequences comes from
        beams_indices = top_k_indices // self.cfg.num_tokens
        token_indices = top_k_indices % self.cfg.num_tokens
        # ic(beams_indices, token_indices)

        # Update pixel values, sequences, and scores
        # pixel_value = pixel_value[beams_indices]
        # ic(next_sequences)
        next_sequences = next_sequences[beams_indices]
        # ic(next_sequences)
        next_sequences = torch.cat([next_sequences, token_indices.unsqueeze(1)], dim=-1)
        # ic(next_sequences)
        sequence_scores = top_k_scores  # / (idx + 3) ** alpha

        # ic('-------------------')
        # if idx > 2: break

    # Select the sequence with the highest score
    max_score, max_score_idx = torch.max(sequence_scores, 0)
    best_sequence = next_sequences[max_score_idx]

    # ic(best_sequence, max_score)
    return best_sequence, max_score

..


This is a portion of my class.

Some code is omitted, especially forward_pass, but it will work properly if you adapt it carefully.

You can also pick up some ideas from it.
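
For reference, here is a minimal sketch of what forward_pass might look like. This is not the omitted code itself, just an assumption that self.model wraps a Hugging Face VisionEncoderDecoderModel; adapt it to your own class.

.

# Hypothetical sketch only, not the original forward_pass.
# Assumes self.model is a Hugging Face VisionEncoderDecoderModel.
def forward_pass(self, pixel_values, decoder_input_ids):
    # Encode the image and decode the current token ids in one call
    outputs = self.model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
    # beam_search only uses 'logits'; 'loss' is None unless labels are given
    return {'logits': outputs.logits, 'loss': outputs.loss}

..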

Thank you.

πŸ™‡πŸ»‍♂️

www.marearts.com


7/04/2023

CrossEntropyLoss example code using input similar to NLP tokens.

Refer to the code.

.

import torch
import torch.nn as nn

# Assume a batch size of 2 and a sequence length of 3, and the model's vocabulary size is 5.
# So, your predicted logits would have a shape of (batch size, sequence length, vocab size)

logits = torch.tensor([
[[0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5]],
[[0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1]]
])
logits = logits.view(-1, logits.shape[-1]) # Reshape logits to be 2D (N, C), where N is batch_size*seq_length, C is vocab_size

# Similarly, your labels would have a shape of (batch size, sequence length).
# These are example labels.

labels = torch.tensor([
[0, 1, 2],
[2, 1, 0]
])
labels = labels.view(-1) # Reshape labels to be 1D (N)

loss_function = nn.CrossEntropyLoss() # Initialize loss function
loss = loss_function(logits, labels) # Compute the loss

print(loss) # Print the loss

..




In this example, logits and labels are explicitly defined tensors. The values in logits represent the output from your model for each token in the sequence for each example in your batch, and the labels tensor represents the correct labels or classes for each of these tokens. nn.CrossEntropyLoss() is then used to compute the loss between the predicted logits and the actual labels.
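
If it helps to see what nn.CrossEntropyLoss does internally, the snippet below reuses the logits and labels tensors from the example above and recomputes the same value by hand with log-softmax followed by negative log-likelihood (a sketch for illustration only).

.

import torch
import torch.nn.functional as F

# log-probabilities for every class, shape (N, C)
log_probs = F.log_softmax(logits, dim=-1)
# pick the log-probability of the correct class for each row, negate, and average
manual_loss = -log_probs[torch.arange(labels.shape[0]), labels].mean()
print(manual_loss)  # same value as the nn.CrossEntropyLoss result above

..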




Thank you.

πŸ™‡πŸ»‍♂️

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000      # vocabulary size; 32,000 is commonly known to work well
limit_alphabet = 6000   # limit on the number of initial tokens kept before merges are performed
output_path = 'hugging_%d' % (vocab_size)
min_frequency = 5       # minimum frequency for a word to be included
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..


BertWordPiece Tokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path,
                                              strip_accents=False,
                                              lowercase=False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..


SentencePiece tokenizer

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram' #unigram, bpe
character_coverage = 1.0 #default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument % (input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..


SentencePiece tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
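
Decoding works the other way around. Continuing from the test above (sp, tokens, and ids already defined), decode_pieces and decode_ids restore the original text:

.

print("Detokenized (from pieces): {}".format(sp.decode_pieces(tokens)))
print("Detokenized (from ids)   : {}".format(sp.decode_ids(ids)))
# Both print: 넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €

..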


Thank you.

πŸ™‡πŸ»‍♂️


6/29/2023

Text summarization datasets

**Paper:**

https://arxiv.org/abs/1908.08345


**Dataset:**

1) the CNN/DailyMail news highlights dataset: somewhat Extractive

- News Articles & Related Highlights: Provides a brief overview of articles

- Input document: limited to 512 tokens

- https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail


2) the New York Times Annotated Corpus (NYT): somewhat Extractive

- Contains 110,540 articles with abstract summaries

- Input document : limited to 800 tokens

- https://research.google/resources/datasets/ny-times-annotated-corpus/


3) XSum: Abstractive

- 226,711 news articles answering the question 'What is this article about?' + one-sentence summaries

- Input document: limited to 512 tokens

- https://github.com/google-research-datasets/xsum_hallucination_annotations

5/23/2023

Create a custom tokenizer: simple code.

In the sample code, the vocabulary is "0,1,2,3,4" and the max length is 20.


.

from typing import List, Union

class CustomTokenizer:
    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        if isinstance(vocab, str):
            with open(vocab, 'r') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f.readlines())}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab.get(self.pad_token)) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, self.pad_token) for id in ids]


vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..


Thank you.

πŸ™‡πŸ»‍♂️


5/13/2022

Convert a simple transformer NER model to ONNX

 

..

!python -m transformers.onnx --model=./checkpoint-21-epoch-11 --feature=token-classification onnx/

..
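
Once exported, a minimal sketch of running the model with onnxruntime could look like the following (the checkpoint path is the one above; the output file name onnx/model.onnx is the exporter's default, and the example text is arbitrary, so adjust everything to your own model):

..

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# load the tokenizer that matches the exported checkpoint
tokenizer = AutoTokenizer.from_pretrained("./checkpoint-21-epoch-11")
session = ort.InferenceSession("onnx/model.onnx")

enc = tokenizer("MareArts is based in Europe", return_tensors="np")
# feed only the inputs the ONNX graph actually declares
feed = {i.name: enc[i.name] for i in session.get_inputs()}

logits = session.run(None, feed)[0]          # (1, seq_len, num_labels)
pred_label_ids = np.argmax(logits, axis=-1)  # predicted label id per token
print(pred_label_ids)

..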



Tokens to word, transformer

 

Refer to the code to figure out how the tokens for a word are composed.

The code shows you the list of tokens for each word.


..

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

example = "This is a tokenization example"

print('input sentence: ', example)
print('---')
print('tokens :')
print( tokenizer.encode(example, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) )
print('---')
print('word and tokens :')
print({x : tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in example.split()})
print('---')
idx = 1
enc =[tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in example.split()]
desired_output = []
for token in enc:
    tokenoutput = []
    for ids in token:
        tokenoutput.append(idx)
        idx += 1
    desired_output.append(tokenoutput)

print('tokens in grouped list')
print(desired_output)
print('---')

..


input sentence:  This is a tokenization example
---
tokens :
[713, 16, 10, 19233, 1938, 1246]
---
word and tokens :
{'This': [713], 'is': [354], 'a': [102], 'tokenization': [46657, 1938], 'example': [46781]}
---
tokens in grouped list
[[1], [2], [3], [4, 5], [6]]
---
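
As an alternative sketch, fast tokenizers (such as the roberta-base one used above) also expose word_ids(), which maps each token position back to the word it came from, with None for special tokens. It is a built-in shortcut for the grouping done manually above:

..

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
enc = tokenizer("This is a tokenization example")
print(enc.word_ids())
# e.g. [None, 0, 1, 2, 3, 3, 4, None]  (None = special token, 3 appears twice for 'tokenization')

..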


Thank you.
www.marearts.com

5/09/2022

BERT Tokenizer, string to token, token to string

 

BERT Tokenizer token understanding examples

..

text = "I am e/mail"
# text = "I am a e-mail"
tokens = tokenizer.tokenize(text)
print(f'Tokens: {tokens}')
print(f'Tokens length: {len(tokens)}')
encoding = tokenizer.encode(text)
print(f'Encoding: {encoding}')
print(f'Encoding length: {len(encoding)}')
tok_text = tokenizer.convert_tokens_to_string(tokens)
print(f'token to string: {tok_text}')

..

output:

Tokens: ['I', 'Ġam', 'Ġe', '/', 'mail']
Tokens length: 5
Encoding: [0, 100, 524, 364, 73, 6380, 2]
Encoding length: 7
token to string: I am e/mail
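
To go the other way, from ids back to a string, decode() can be used on the encoding from the snippet above (a small addition for completeness):

..

print(tokenizer.decode(encoding))                            # includes special tokens
print(tokenizer.decode(encoding, skip_special_tokens=True))  # I am e/mail

..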

--
Thank you.
www.marearts.com