MareArts Computer Vision Study.: hugging face

Showing posts with label hugging face. Show all posts

7/23/2024

Download all files in a specific folder from a Hugging Face model

refer to code:

# Download all files from the IP-Adapter/sdxl_models folder
from huggingface_hub import snapshot_download

# Fetch only the files matching "sdxl_models/*" from the h94/IP-Adapter
# model repository; they are written under ./IP-Adapter_sdxl_models.
sdxl_download_options = {
    "repo_id": "h94/IP-Adapter",
    "repo_type": "model",
    "local_dir": "./IP-Adapter_sdxl_models",
    # allow_patterns restricts the snapshot to the sdxl_models folder.
    "allow_patterns": ["sdxl_models/*"],
}
snapshot_download(**sdxl_download_options)

It downloads all files under the sdxl_models folder.

Thank you.

7/19/2024

Download specific model from hugging face

refer to code

import os
import shutil
from huggingface_hub import hf_hub_download

# Repository name
repo_id = "h94/IP-Adapter"
# Directory to save the downloaded files
local_directory = "./models/image_encoder"

# Ensure the local directory exists
os.makedirs(local_directory, exist_ok=True)

# List of files to download (paths are relative to the repository root)
files_to_download = [
    "models/image_encoder/config.json",
    "models/image_encoder/model.safetensors",
    "models/image_encoder/pytorch_model.bin",
]

# Download each file and place a real copy of it in the desired directory
for file in files_to_download:
    # hf_hub_download returns the path of the file inside the local hub cache.
    file_path = hf_hub_download(repo_id=repo_id, filename=file)
    # Construct the destination path (flat: only the file name is kept)
    dest_path = os.path.join(local_directory, os.path.basename(file))
    # Copy instead of move: the cached path is typically a relative symlink to
    # a blob, so moving it would leave a dangling link at the destination and
    # a corrupted cache entry. shutil.copy follows the link and copies the
    # actual file content, leaving the cache intact for later reuse.
    shutil.copy(file_path, dest_path)
    print(f"Downloaded and copied to {dest_path}")

Thank you.

Another option is:

# Alternative call: use local_directory as the cache directory and force a fresh
# download. `repo_id`, `file` and `local_directory` must already be defined by
# the surrounding script.
# NOTE(review): presumably the file ends up inside the hub cache layout created
# under local_directory rather than directly in it; use the returned file_path
# and confirm against the huggingface_hub documentation.
file_path = hf_hub_download(repo_id=repo_id, filename=file, cache_dir=local_directory, force_download=True)

7/15/2023

Combine a custom FC layer with a Hugging Face model; good to remember and adapt for your own modifications

refer to code:

    def model_forward(self, pixel_values, labels):
        # Origin vit encoder-decoder outputs
        outputs = self.model(pixel_values=pixel_values, labels=labels, output_hidden_states=True)
        # Get last hidden state
        last_hidden_state = outputs.decoder_hidden_states[-1] # batch_size, seq_len, hidden_size, ex)5, 15, 768
        return last_hidden_state

    def fc_part(self, last_hidden_state):
        """Project decoder hidden states through the custom fully connected layer.

        Args:
            last_hidden_state: Tensor whose last dimension equals
                ``self.model.config.decoder.hidden_size``; expected to be
                (batch_size, seq_len, hidden_size).

        Returns:
            The output of ``self.custom_decoder_fc`` applied to the flattened
            hidden states, i.e. one row per (batch, position) pair:
            (batch_size * seq_len, vocab_size).
        """
        hidden_size = self.model.config.decoder.hidden_size
        # reshape() instead of view(): identical for contiguous input, but it
        # also accepts non-contiguous tensors (e.g. transposed or sliced),
        # where view() would raise.
        flat_hidden = last_hidden_state.reshape(-1, hidden_size)  # batch_size*seq_len, hidden_size
        # Apply the fully connected layer
        return self.custom_decoder_fc(flat_hidden)  # batch_size*seq_len, vocab_size

    def compute_loss(self, new_logits, labels):
        """Compute the loss between flattened logits and flattened labels.

        Args:
            new_logits: Logits with one row per target position, e.g.
                (batch_size * seq_len, vocab_size).
            labels: Target token ids, e.g. (batch_size, seq_len); flattened
                here to (batch_size * seq_len,).

        Returns:
            Whatever ``self.loss_f`` returns for the two tensors (a scalar
            tensor for a typical mean-reduced classification loss).
        """
        # reshape() instead of view(): same result for contiguous labels, but
        # it also works for non-contiguous ones such as a shifted slice
        # labels[:, 1:], where view(-1) would raise.
        flat_labels = labels.reshape(-1)  # batch_size, seq_len -> batch_size*seq_len
        # [batch_size*seq_len, vocab_size] vs [batch_size*seq_len]  ex) [70, 13] vs [70]
        return self.loss_f(new_logits, flat_labels)

    def forward_pass(self, pixel_values, labels):
        """Chain model_forward, fc_part and compute_loss into one forward step.

        Args:
            pixel_values: Image batch passed to ``self.model_forward``.
            labels: Target token ids, indexed as (batch_size, seq_len).

        Returns:
            A dict with ``'logits'`` reshaped to (batch_size, seq_len, vocab_size)
            and ``'loss'`` as returned by ``self.compute_loss``.
        """
        batch_size, seq_len = labels.shape[0], labels.shape[1]

        # hidden state -> flat logits (batch_size*seq_len, vocab_size)
        hidden = self.model_forward(pixel_values, labels)
        flat_logits = self.fc_part(hidden)

        # The loss is computed on the flat logits, before they are folded back.
        loss_value = self.compute_loss(flat_logits, labels)

        # Fold the logits back so they line up with the labels' two leading dims.
        folded_logits = flat_logits.view(batch_size, seq_len, -1)
        return {'logits': folded_logits, 'loss': loss_value}

forward_pass runs the process step by step.

In the end it returns the logits computed from the last hidden state, together with the loss.

Thank you.

www.marearts.com

🙇🏻‍♂️

7/13/2023

Beam search function for image to text or nlp inference purpose.

refer to code first.

# This beam search only handles batch size 1.
    def beam_search(self, pixel_value, max_length):
        """Decode a single image with beam search and return the best sequence.

        Only a batch size of 1 is supported: all beams are pooled into one
        candidate set, so several images would be mixed together.

        Temperature scaling and length normalization are not applied; each
        score is the plain sum of token log-probabilities. No end-of-sequence
        handling is done, so every beam is extended for the full length.

        Args:
            pixel_value: Image tensor with a leading batch dimension of 1,
                e.g. [1, 3, 224, 224].
            max_length: Number of tokens to generate after the start token
                (at least one token is always generated).

        Returns:
            A tuple ``(best_sequence, max_score)``: the 1-D tensor of token ids
            of the highest-scoring beam (start token included) and its
            cumulative log-probability as a 0-dim tensor.
        """
        beam_size = self.cfg.num_beams

        # Start every hypothesis from the decoder start token: [batch, 1]
        first_sequence = torch.full(
            (pixel_value.shape[0], 1),
            self.model.config.decoder_start_token_id,
        ).to(pixel_value.device)

        # Predict the first real token; only the logits of the last position
        # are needed: [batch, seq, vocab_size] -> [batch, vocab_size]
        outputs = self.forward_pass(pixel_value, first_sequence)
        next_token_logits = outputs['logits'][:, -1, :]

        # log_softmax instead of log(softmax(...)): very unlikely tokens keep a
        # finite log-probability instead of underflowing to log(0) = -inf.
        next_token_scores = F.log_softmax(next_token_logits, dim=-1)
        top_k_scores, top_k_ids = torch.topk(next_token_scores, beam_size)

        # One row per beam: the start token followed by one of the top-k tokens
        next_sequences = first_sequence.repeat_interleave(beam_size, dim=0)  # [beam, 1]
        next_sequences = torch.cat([next_sequences, top_k_ids.view(-1, 1)], dim=-1)  # [beam, 2]

        # Cumulative score of each beam = sum of the log-probabilities of its tokens
        sequence_scores = top_k_scores.view(-1)

        # Every beam decodes against the same image, so repeat it per beam
        pixel_value = pixel_value.repeat_interleave(beam_size, dim=0)  # e.g. [beam, 3, 224, 224]

        for _ in range(max_length - 1):  # one token has already been generated
            outputs = self.forward_pass(pixel_value, next_sequences)
            next_token_scores = F.log_softmax(outputs['logits'][:, -1, :], dim=-1)  # [beam, vocab_size]

            # Score of every (beam, next token) continuation
            new_scores = sequence_scores.unsqueeze(1) + next_token_scores  # [beam, vocab_size]

            # Keep the best beam_size continuations across all beams
            sequence_scores, top_k_indices = torch.topk(new_scores.reshape(-1), beam_size)

            # Recover (beam, token) from the flat index. The divisor has to be
            # the real row width of new_scores, so take it from the tensor
            # rather than from a separately configured vocabulary size.
            vocab_size = new_scores.shape[-1]
            beams_indices = top_k_indices // vocab_size
            token_indices = top_k_indices % vocab_size

            # Extend the surviving beams with their chosen token
            next_sequences = next_sequences[beams_indices]
            next_sequences = torch.cat([next_sequences, token_indices.unsqueeze(1)], dim=-1)

        # Select the sequence with the highest cumulative score
        max_score, max_score_idx = torch.max(sequence_scores, 0)
        best_sequence = next_sequences[max_score_idx]

        return best_sequence, max_score

This is portion of my class.

Some code is omitted, in particular forward_pass; however, the code will work properly if you adapt it carefully.

You can also pick up some ideas from it.

Thank you.

🙇🏻‍♂️

www.marearts.com

Pages

7/23/2024

Download all files in a specific folder from a Hugging Face model

7/19/2024

Download specific model from hugging face

7/15/2023

Combine a custom FC layer with a Hugging Face model; good to remember and adapt for your own modifications

7/13/2023

Beam search function for image to text or nlp inference purpose.