
7/13/2023

Beam search function for image-to-text or NLP inference.

Refer to the code first.

.

# This beam search only deals with batch size 1.
# (Method of a larger class; assumes `import torch` and `import torch.nn.functional as F`,
#  and that self.forward_pass / self.model / self.cfg are defined elsewhere in the class.)
def beam_search(self, pixel_value, max_length):
    beam_size = self.cfg.num_beams
    alpha = self.cfg.beam_alpha        # Length normalization coefficient
    temperature = self.cfg.beam_temp   # Temperature for softmax

    # Initialize input ids with the decoder start (BOS) token id
    first_sequence = torch.full((pixel_value.shape[0], 1), self.model.config.decoder_start_token_id).to(pixel_value.device)
    # ic(first_sequence)  # tensor([[1]])

    # Predict the second token id
    outputs = self.forward_pass(pixel_value, first_sequence)
    # ic(outputs.keys())  # dict_keys(['logits', 'loss'])
    # We only need the logits corresponding to the last prediction
    next_token_logits = outputs['logits'][:, -1, :]
    # ic(outputs['logits'].shape)            # [1, 1, 13] batch, seq, vocab_size
    # ic(outputs['logits'][:, -1, :].shape)  # [1, 13] batch, vocab_size

    # Apply temperature (optional)
    # ic(next_token_logits)
    # [-5.0641, 32.7805, -2.6743, -4.6459,  0.8130, -1.3443, -1.2016, -4.0770,
    #  -3.5401,  0.2425, -5.3685, -1.8074, -5.2606]
    # next_token_logits /= temperature
    # ic(next_token_logits)
    # [-7.2344, 46.8292, -3.8204, -6.6370,  1.1614, -1.9205, -1.7166, -5.8243,
    #  -5.0573,  0.3464, -7.6693, -2.5820, -7.5152]

    # Select the top-k tokens
    next_token_probs = F.softmax(next_token_logits, dim=-1)
    top_k_probs, top_k_ids = torch.topk(next_token_probs, beam_size)
    # ic(F.softmax(next_token_logits, dim=-1))
    # tensor([[3.3148e-24, 1.0000e+00, 1.0072e-22, 6.0241e-24, 1.4680e-20, 6.7340e-22,
    #          8.2570e-22, 1.3579e-23, 2.9239e-23, 6.4976e-21, 2.1458e-24, 3.4751e-22,
    #          2.5034e-24]])
    # ic(top_k_probs, top_k_ids)
    # top_k_probs: tensor([[1.]], grad_fn=<TopkBackward0>)
    # top_k_ids:   tensor([[1]])

    # Prepare the next sequences: each top-k token is appended to first_sequence
    # ic(first_sequence.shape)  # [1, 1]
    next_sequences = first_sequence.repeat_interleave(beam_size, dim=0)
    # ic(next_sequences.shape)  # [10, 1] 10 is beam size, 1 is seq length
    next_sequences = torch.cat([next_sequences, top_k_ids.view(-1, 1)], dim=-1)
    # ic(next_sequences.shape)  # [10, 2] 10 is beam size, 2 is seq length
    # ic(next_sequences)

    # Also prepare a tensor to hold the cumulative score of each sequence,
    # i.e. the sum of the log probabilities of each token in the sequence
    sequence_scores = torch.log(top_k_probs).view(-1)  # / (1 + 1) ** alpha
    # ic(sequence_scores)  # [ 0.0000, -15.9837]

    # We'll need to repeat the pixel values for each sequence in each beam
    pixel_value = pixel_value.repeat_interleave(beam_size, dim=0)
    # ic(pixel_value.shape)  # [10, 3, 224, 224], 10 is beam size, 3 is channel, 224 is image size

    for idx in range(max_length - 1):  # We already generated one token
        # ic(idx, '--------------------')
        outputs = self.forward_pass(pixel_value, next_sequences)
        next_token_logits = outputs['logits'][:, -1, :]
        # ic(outputs['logits'].shape, outputs['logits'])  # [2, 2, 13], batch, seq, vocab_size
        # ic(next_token_logits.shape, next_token_logits)

        # Apply temperature (optional)
        # next_token_logits /= temperature

        # Convert logits to probabilities and calculate new scores
        next_token_probs = F.softmax(next_token_logits, dim=-1)
        # ic(next_token_probs.shape, next_token_probs)    # [2, 13], batch, vocab_size
        next_token_scores = torch.log(next_token_probs)
        # ic(next_token_scores.shape, next_token_scores)  # [2, 13], batch, vocab_size

        new_scores = sequence_scores.unsqueeze(1) + next_token_scores
        # ic(sequence_scores.unsqueeze(1))
        # ic(new_scores.shape, new_scores)  # [2, 13], batch, vocab_size

        # Select the top-k sequences over the flattened (beam x vocab) scores
        # ic(new_scores.view(-1), new_scores.view(-1).shape)
        top_k_scores, top_k_indices = torch.topk(new_scores.view(-1), beam_size)
        # ic(top_k_scores, top_k_indices)

        # Get the beam and token that each of the top-k sequences comes from
        beams_indices = top_k_indices // self.cfg.num_tokens
        token_indices = top_k_indices % self.cfg.num_tokens
        # ic(beams_indices, token_indices)

        # Update pixel values, sequences, and scores
        # pixel_value = pixel_value[beams_indices]
        # ic(next_sequences)
        next_sequences = next_sequences[beams_indices]
        # ic(next_sequences)
        next_sequences = torch.cat([next_sequences, token_indices.unsqueeze(1)], dim=-1)
        # ic(next_sequences)
        sequence_scores = top_k_scores  # / (idx + 3) ** alpha

        # ic('-------------------')
        # if idx > 2: break

    # Select the sequence with the highest score
    max_score, max_score_idx = torch.max(sequence_scores, 0)
    best_sequence = next_sequences[max_score_idx]

    # ic(best_sequence, max_score)
    return best_sequence, max_score

..


This is a portion of my class.

Some code is omitted, especially forward_pass, but it will work properly if you adapt it carefully.

You can also pick up some ideas from it.
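
For reference, here is a minimal sketch of what forward_pass might look like. This is not the omitted code itself, just an assumption that self.model wraps a Hugging Face VisionEncoderDecoderModel; adapt it to your own class.

.

# Hypothetical sketch only, not the original forward_pass.
# Assumes self.model is a Hugging Face VisionEncoderDecoderModel.
def forward_pass(self, pixel_values, decoder_input_ids):
    # Encode the image and decode the current token ids in one call
    outputs = self.model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
    # beam_search only uses 'logits'; 'loss' is None unless labels are given
    return {'logits': outputs.logits, 'loss': outputs.loss}

..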

Thank you.

πŸ™‡πŸ»‍♂️

www.marearts.com


7/04/2023

CrossEntropyLoss example code using input similar to NLP tokens.

Refer to the code.

.

import torch
import torch.nn as nn

# Assume a batch size of 2 and a sequence length of 3, and the model's vocabulary size is 5.
# So, your predicted logits would have a shape of (batch size, sequence length, vocab size)

logits = torch.tensor([
[[0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5]],
[[0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1]]
])
logits = logits.view(-1, logits.shape[-1]) # Reshape logits to be 2D (N, C), where N is batch_size*seq_length, C is vocab_size

# Similarly, your labels would have a shape of (batch size, sequence length).
# These are example labels.

labels = torch.tensor([
[0, 1, 2],
[2, 1, 0]
])
labels = labels.view(-1) # Reshape labels to be 1D (N)

loss_function = nn.CrossEntropyLoss() # Initialize loss function
loss = loss_function(logits, labels) # Compute the loss

print(loss) # Print the loss

..




In this example, logits and labels are explicitly defined tensors. The values in logits represent the output from your model for each token in the sequence for each example in your batch, and the labels tensor represents the correct labels or classes for each of these tokens. nn.CrossEntropyLoss() is then used to compute the loss between the predicted logits and the actual labels.
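
If it helps to see what nn.CrossEntropyLoss does internally, the snippet below reuses the logits and labels tensors from the example above and recomputes the same value by hand with log-softmax followed by negative log-likelihood (a sketch for illustration only).

.

import torch
import torch.nn.functional as F

# log-probabilities for every class, shape (N, C)
log_probs = F.log_softmax(logits, dim=-1)
# pick the log-probability of the correct class for each row, negate, and average
manual_loss = -log_probs[torch.arange(labels.shape[0]), labels].mean()
print(manual_loss)  # same value as the nn.CrossEntropyLoss result above

..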




Thank you.

πŸ™‡πŸ»‍♂️

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000      # vocabulary size; 32,000 is commonly known to work well
limit_alphabet = 6000   # limit on the number of initial tokens kept before merges are performed
output_path = 'hugging_%d' % (vocab_size)
min_frequency = 5       # minimum frequency for a word to be included
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..


BertWordPiece Tokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path,
                                              strip_accents=False,
                                              lowercase=False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..


SentencePiece tokenizer

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram' #unigram, bpe
character_coverage = 1.0 #default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument % (input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..


SentencePiece tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
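
Decoding works the other way around. Continuing from the test above (sp, tokens, and ids already defined), decode_pieces and decode_ids restore the original text:

.

print("Detokenized (from pieces): {}".format(sp.decode_pieces(tokens)))
print("Detokenized (from ids)   : {}".format(sp.decode_ids(ids)))
# Both print: 넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €

..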


Thank you.

πŸ™‡πŸ»‍♂️


6/29/2023

Text summarization datasets

**Paper:**

https://arxiv.org/abs/1908.08345


**Dataset:**

1) the CNN/DailyMail news highlights dataset: somewhat Extractive

- News Articles & Related Highlights: Provides a brief overview of articles

- Input document: limited to 512 tokens

- https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail


2) the New York Times Annotated Corpus (NYT): somewhat Extractive

- Contains 110,540 articles with abstract summaries

- Input document : limited to 800 tokens

- https://research.google/resources/datasets/ny-times-annotated-corpus/


3) XSum: Abstractive

- 226,711 news articles answering the question 'What is this article about?' + one-sentence summaries

- Input document: limited to 512 tokens

- https://github.com/google-research-datasets/xsum_hallucination_annotations

5/23/2023

Create a custom tokenizer: simple code.

In the sample code, the vocabulary is "0,1,2,3,4" and the max length is 20.


.

from typing import List, Union

class CustomTokenizer:
    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        if isinstance(vocab, str):
            with open(vocab, 'r') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f.readlines())}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab.get(self.pad_token)) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, self.pad_token) for id in ids]


vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..


Thank you.

πŸ™‡πŸ»‍♂️


5/13/2022

Convert a simple transformer NER model to ONNX

 

..

!python -m transformers.onnx --model=./checkpoint-21-epoch-11 --feature=token-classification onnx/

..
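
Once exported, a minimal sketch of running the model with onnxruntime could look like the following (the checkpoint path is the one above; the output file name onnx/model.onnx is the exporter's default, and the example text is arbitrary, so adjust everything to your own model):

..

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# load the tokenizer that matches the exported checkpoint
tokenizer = AutoTokenizer.from_pretrained("./checkpoint-21-epoch-11")
session = ort.InferenceSession("onnx/model.onnx")

enc = tokenizer("MareArts is based in Europe", return_tensors="np")
# feed only the inputs the ONNX graph actually declares
feed = {i.name: enc[i.name] for i in session.get_inputs()}

logits = session.run(None, feed)[0]          # (1, seq_len, num_labels)
pred_label_ids = np.argmax(logits, axis=-1)  # predicted label id per token
print(pred_label_ids)

..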



Tokens to word, transformer

 

Refer to the code to figure out how the tokens for a word are composed.

The code shows you the list of tokens for each word.


..

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

example = "This is a tokenization example"

print('input sentence: ', example)
print('---')
print('tokens :')
print( tokenizer.encode(example, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) )
print('---')
print('word and tokens :')
print({x : tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in example.split()})
print('---')
idx = 1
enc =[tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in example.split()]
desired_output = []
for token in enc:
    tokenoutput = []
    for ids in token:
        tokenoutput.append(idx)
        idx += 1
    desired_output.append(tokenoutput)

print('tokens in grouped list')
print(desired_output)
print('---')

..


input sentence:  This is a tokenization example
---
tokens :
[713, 16, 10, 19233, 1938, 1246]
---
word and tokens :
{'This': [713], 'is': [354], 'a': [102], 'tokenization': [46657, 1938], 'example': [46781]}
---
tokens in grouped list
[[1], [2], [3], [4, 5], [6]]
---
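
As an alternative sketch, fast tokenizers (such as the roberta-base one used above) also expose word_ids(), which maps each token position back to the word it came from, with None for special tokens. It is a built-in shortcut for the grouping done manually above:

..

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
enc = tokenizer("This is a tokenization example")
print(enc.word_ids())
# e.g. [None, 0, 1, 2, 3, 3, 4, None]  (None = special token, 3 appears twice for 'tokenization')

..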


Thank you.
www.marearts.com

5/09/2022

BERT Tokenizer, string to token, token to string

 

BERT Tokenizer token understanding examples

..

text = "I am e/mail"
# text = "I am a e-mail"
tokens = tokenizer.tokenize(text)
print(f'Tokens: {tokens}')
print(f'Tokens length: {len(tokens)}')
encoding = tokenizer.encode(text)
print(f'Encoding: {encoding}')
print(f'Encoding length: {len(encoding)}')
tok_text = tokenizer.convert_tokens_to_string(tokens)
print(f'token to string: {tok_text}')

..

output:

Tokens: ['I', 'Ġam', 'Ġe', '/', 'mail']
Tokens length: 5
Encoding: [0, 100, 524, 364, 73, 6380, 2]
Encoding length: 7
token to string: I am e/mail
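
To go the other way, from ids back to a string, decode() can be used on the encoding from the snippet above (a small addition for completeness):

..

print(tokenizer.decode(encoding))                            # includes special tokens
print(tokenizer.decode(encoding, skip_special_tokens=True))  # I am e/mail

..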

--
Thank you.
www.marearts.com