
CLIP usage

2022-06-10 11:39:00 MallocLu

Environment setup

# 1.  Install PyTorch first
# 2.  Install the dependencies (ftfy, regex, tqdm)
pip install ftfy regex tqdm
# 3.  Install CLIP
pip install git+https://github.com/openai/CLIP.git

#  On an intranet, a mirror can be used: pip install git+https://github.91chi.fun/https://github.com/openai/CLIP.git

API

# 1.  List the available models
clip.available_models()
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']

# 2.  Load the model and the matching image preprocessor
model, preprocess = clip.load("ViT-B/32")

# 3.  preprocess converts a PIL Image into a tensor of shape [3, 224, 224]; unsqueeze(0) then adds a
#     batch dimension, giving [1, 3, 224, 224], which is what the model expects
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)

# 4.  Convert a batch of sentences into token-id tensors of shape [batch_size, context_length]
#     Each sentence is wrapped with BOS (49406) and EOS (49407) and padded to context_length (default 77)
#     (If the tokenized length exceeds context_length-2, pass truncate=True; the result is then
#      BOS + truncated content + EOS, i.e. the EOS token is kept rather than cut off)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) # [3, 77]
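
For reference, a minimal sketch (not from the original post) of the truncate behaviour described above; the long caption string is just a made-up example:

import clip

# A caption that tokenizes to far more than 77 tokens
long_caption = "a photo of " + "a very " * 100 + "long caption"
tokens = clip.tokenize([long_caption], truncate=True)  # without truncate=True this raises a RuntimeError
print(tokens.shape)          # torch.Size([1, 77])
print(tokens[0, 0].item())   # 49406 (BOS)
print(tokens[0, -1].item())  # 49407 (EOS kept at the last position)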

# 5.  Get the features of a batch of images
image_features = model.encode_image(image)

# 6.  Get the features of a batch of texts
text_features = model.encode_text(text)

# 7.  Get the similarity logits between every image and every text
#     (these are cosine similarities scaled by the model's logit_scale, not probabilities in 0~1;
#      apply softmax to turn them into probabilities)
logits_per_image, logits_per_text = model(image, text)
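
As a side note (a sketch, not part of the original post), the logits returned by model(image, text) can be reproduced by normalizing the two feature tensors and scaling the cosine similarities by the model's learned logit_scale:

with torch.no_grad():
    img = image_features / image_features.norm(dim=-1, keepdim=True)
    txt = text_features / text_features.norm(dim=-1, keepdim=True)
    cos_sim = img @ txt.T                       # plain cosine similarities in [-1, 1]
    logits = model.logit_scale.exp() * cos_sim  # should match logits_per_image
    probs = logits.softmax(dim=-1)              # probability of each text for each image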

Shape & dtype for each model

ViT-B/32
# feature dim 512, input resolution (224, 224)
image           torch.Size([B, 3, 224, 224])   torch.float32
text            torch.Size([B, 77])            torch.int32
image_features  torch.Size([B, 512])           torch.float16
text_features   torch.Size([B, 512])           torch.float16

ViT-B/16
# feature dim 512, input resolution (224, 224)
image           torch.Size([B, 3, 224, 224])   torch.float32
text            torch.Size([B, 77])            torch.int32
image_features  torch.Size([B, 512])           torch.float16
text_features   torch.Size([B, 512])           torch.float16

ViT-L/14
# feature dim 768, input resolution (224, 224)
image           torch.Size([B, 3, 224, 224])   torch.float32
text            torch.Size([B, 77])            torch.int32
image_features  torch.Size([B, 768])           torch.float16
text_features   torch.Size([B, 768])           torch.float16

ViT-L/14@336px
# feature dim 768, input resolution (336, 336)
image           torch.Size([B, 3, 336, 336])   torch.float32
text            torch.Size([B, 77])            torch.int32
image_features  torch.Size([B, 768])           torch.float16
text_features   torch.Size([B, 768])           torch.float16
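
A small sketch (assuming a CUDA device and the CLIP.png image used in the examples below) to verify the table above:

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

print(image.shape, image.dtype)                    # torch.Size([1, 3, 224, 224]) torch.float32
print(text.shape, text.dtype)                      # torch.Size([1, 77]) torch.int32 (int64 on older torch)
print(image_features.shape, image_features.dtype)  # torch.Size([1, 512]) torch.float16 on CUDA (float32 on CPU)
print(text_features.shape, text_features.dtype)    # torch.Size([1, 512]) torch.float16 on CUDA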

preprocess & tokenize for each model

#  No matter which model is loaded, clip.tokenize returns the same result

#  preprocess differs only in the size of the returned image:
#  ViT-B/32, ViT-B/16 and ViT-L/14 therefore share the same preprocess output, while ViT-L/14@336px differs from them
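
A hedged sketch to check this resolution difference: clip.load returns a torchvision Compose, and printing it shows the Resize/CenterCrop sizes (note that this downloads both checkpoints):

import clip

_, preprocess_b32 = clip.load("ViT-B/32", device="cpu")
_, preprocess_l336 = clip.load("ViT-L/14@336px", device="cpu")

# The printed transforms differ only in the 224 vs 336 resolution
print(preprocess_b32)
print(preprocess_l336)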

Basic usage

Example 1

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]

Example 2

import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
#  The factor of 100 roughly matches CLIP's learned logit scale; it acts as a temperature on the
#  cosine similarities before the softmax rather than simply converting them into percentages
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{
      cifar100.classes[index]:>16s}: {
      100 * value.item():.2f}%")

Example 3

import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)


def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {
      accuracy:.3f}")

(Important) Freezing or updating the CLIP parameters

About detach

#  Because this model only uses the CLIP visual encoder, we only check whether the visual encoder's parameters have changed
#  With neither position 1 nor position 2 enabled, everything prints False, i.e. all parameters are updated
#  With only position 1 enabled, the CLIP parameters print True and the Linear parameters print False, i.e. only the Linear parameters are updated
#  With only position 2 enabled, the CLIP parameters print False and the Linear parameters print True, i.e. only the CLIP parameters are updated


import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        self.linear = nn.Linear(512, 10)

        #  Position 2
        # for param in self.linear.parameters():
        #     param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)

        #  Position 1
        # features = features.detach()

        return self.linear(features)


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))

storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()

for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

for name, param in net.model.visual.named_parameters():
    print(f"{
      name} {
      torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{
      name} {
      torch.equal(param, storeParam[name])}")
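
As an alternative to the detach trick at position 1 (a sketch, not from the original post), the whole CLIP model can also be frozen explicitly with requires_grad, handing only the trainable parameters to the optimizer:

# Freeze every CLIP parameter and train only the linear head
net = Net()
for param in net.model.parameters():
    param.requires_grad = False

optimizer = torch.optim.SGD(
    (p for p in net.parameters() if p.requires_grad), lr=1e-2
)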

CLIP layer structure
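
The dumps below can be reproduced by loading a model and printing it, e.g.:

import clip

model, _ = clip.load("ViT-B/32", device="cpu")
print(model)         # the whole CLIP module (visual tower + text tower)
print(model.visual)  # only the image encoder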

ViT-B/32

CLIP(
  #  Image-related
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
    
  #  Text-related
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

ViT-B/16

CLIP(
  #  Image-related
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  
  #  Text-related
  (token_embedding): Embedding(49408, 512)
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

ViT-L/14

CLIP(
  #  Image-related
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (12): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (13): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (14): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (15): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (16): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (17): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (18): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (19): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (20): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (21): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (22): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (23): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
    
  #  Text-related
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (token_embedding): Embedding(49408, 768)
  (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

ViT-L/14@336px

CLIP(
  #  Image-related
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (12): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (13): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (14): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (15): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (16): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (17): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (18): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (19): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (20): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (21): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (22): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (23): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
   
  # Text related
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (token_embedding): Embedding(49408, 768)
  (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
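
The module printouts above (minus the inline comments) can be reproduced by simply printing the loaded model; a minimal sketch, using ViT-L/14@336px as an example:

import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device=device)

# printing the nn.Module gives the block-by-block structure shown above
print(model)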

CLIP Parameter structure

ViT-B

Includes ViT-B/32 and ViT-B/16

# logit_scale is the learned weight applied when computing cosine similarity; the corresponding code is below (a runnable sketch follows the parameter lists):
# normalized features
# image_features = image_features / image_features.norm(dim=1, keepdim=True)
# text_features = text_features / text_features.norm(dim=1, keepdim=True)
# # cosine similarity as logits
# logit_scale = self.logit_scale.exp()
# logits_per_image = logit_scale * image_features @ text_features.t()
# logits_per_text = logits_per_image.t()
logit_scale


#  Picture related 
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj

# Text related
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection
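
Either of these name lists (here for ViT-B, below for ViT-L) can be regenerated from the loaded model itself; a minimal sketch:

import clip

model, _ = clip.load("ViT-B/32", device="cpu")

# one line per learnable tensor: its name and shape
for name, param in model.named_parameters():
    print(name, tuple(param.shape))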

ViT-L

Includes ViT-L/14 and ViT-L/14@336px

logit_scale

#  Picture related 
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.transformer.resblocks.12.attn.in_proj_weight
visual.transformer.resblocks.12.attn.in_proj_bias
visual.transformer.resblocks.12.attn.out_proj.weight
visual.transformer.resblocks.12.attn.out_proj.bias
visual.transformer.resblocks.12.ln_1.weight
visual.transformer.resblocks.12.ln_1.bias
visual.transformer.resblocks.12.mlp.c_fc.weight
visual.transformer.resblocks.12.mlp.c_fc.bias
visual.transformer.resblocks.12.mlp.c_proj.weight
visual.transformer.resblocks.12.mlp.c_proj.bias
visual.transformer.resblocks.12.ln_2.weight
visual.transformer.resblocks.12.ln_2.bias
visual.transformer.resblocks.13.attn.in_proj_weight
visual.transformer.resblocks.13.attn.in_proj_bias
visual.transformer.resblocks.13.attn.out_proj.weight
visual.transformer.resblocks.13.attn.out_proj.bias
visual.transformer.resblocks.13.ln_1.weight
visual.transformer.resblocks.13.ln_1.bias
visual.transformer.resblocks.13.mlp.c_fc.weight
visual.transformer.resblocks.13.mlp.c_fc.bias
visual.transformer.resblocks.13.mlp.c_proj.weight
visual.transformer.resblocks.13.mlp.c_proj.bias
visual.transformer.resblocks.13.ln_2.weight
visual.transformer.resblocks.13.ln_2.bias
visual.transformer.resblocks.14.attn.in_proj_weight
visual.transformer.resblocks.14.attn.in_proj_bias
visual.transformer.resblocks.14.attn.out_proj.weight
visual.transformer.resblocks.14.attn.out_proj.bias
visual.transformer.resblocks.14.ln_1.weight
visual.transformer.resblocks.14.ln_1.bias
visual.transformer.resblocks.14.mlp.c_fc.weight
visual.transformer.resblocks.14.mlp.c_fc.bias
visual.transformer.resblocks.14.mlp.c_proj.weight
visual.transformer.resblocks.14.mlp.c_proj.bias
visual.transformer.resblocks.14.ln_2.weight
visual.transformer.resblocks.14.ln_2.bias
visual.transformer.resblocks.15.attn.in_proj_weight
visual.transformer.resblocks.15.attn.in_proj_bias
visual.transformer.resblocks.15.attn.out_proj.weight
visual.transformer.resblocks.15.attn.out_proj.bias
visual.transformer.resblocks.15.ln_1.weight
visual.transformer.resblocks.15.ln_1.bias
visual.transformer.resblocks.15.mlp.c_fc.weight
visual.transformer.resblocks.15.mlp.c_fc.bias
visual.transformer.resblocks.15.mlp.c_proj.weight
visual.transformer.resblocks.15.mlp.c_proj.bias
visual.transformer.resblocks.15.ln_2.weight
visual.transformer.resblocks.15.ln_2.bias
visual.transformer.resblocks.16.attn.in_proj_weight
visual.transformer.resblocks.16.attn.in_proj_bias
visual.transformer.resblocks.16.attn.out_proj.weight
visual.transformer.resblocks.16.attn.out_proj.bias
visual.transformer.resblocks.16.ln_1.weight
visual.transformer.resblocks.16.ln_1.bias
visual.transformer.resblocks.16.mlp.c_fc.weight
visual.transformer.resblocks.16.mlp.c_fc.bias
visual.transformer.resblocks.16.mlp.c_proj.weight
visual.transformer.resblocks.16.mlp.c_proj.bias
visual.transformer.resblocks.16.ln_2.weight
visual.transformer.resblocks.16.ln_2.bias
visual.transformer.resblocks.17.attn.in_proj_weight
visual.transformer.resblocks.17.attn.in_proj_bias
visual.transformer.resblocks.17.attn.out_proj.weight
visual.transformer.resblocks.17.attn.out_proj.bias
visual.transformer.resblocks.17.ln_1.weight
visual.transformer.resblocks.17.ln_1.bias
visual.transformer.resblocks.17.mlp.c_fc.weight
visual.transformer.resblocks.17.mlp.c_fc.bias
visual.transformer.resblocks.17.mlp.c_proj.weight
visual.transformer.resblocks.17.mlp.c_proj.bias
visual.transformer.resblocks.17.ln_2.weight
visual.transformer.resblocks.17.ln_2.bias
visual.transformer.resblocks.18.attn.in_proj_weight
visual.transformer.resblocks.18.attn.in_proj_bias
visual.transformer.resblocks.18.attn.out_proj.weight
visual.transformer.resblocks.18.attn.out_proj.bias
visual.transformer.resblocks.18.ln_1.weight
visual.transformer.resblocks.18.ln_1.bias
visual.transformer.resblocks.18.mlp.c_fc.weight
visual.transformer.resblocks.18.mlp.c_fc.bias
visual.transformer.resblocks.18.mlp.c_proj.weight
visual.transformer.resblocks.18.mlp.c_proj.bias
visual.transformer.resblocks.18.ln_2.weight
visual.transformer.resblocks.18.ln_2.bias
visual.transformer.resblocks.19.attn.in_proj_weight
visual.transformer.resblocks.19.attn.in_proj_bias
visual.transformer.resblocks.19.attn.out_proj.weight
visual.transformer.resblocks.19.attn.out_proj.bias
visual.transformer.resblocks.19.ln_1.weight
visual.transformer.resblocks.19.ln_1.bias
visual.transformer.resblocks.19.mlp.c_fc.weight
visual.transformer.resblocks.19.mlp.c_fc.bias
visual.transformer.resblocks.19.mlp.c_proj.weight
visual.transformer.resblocks.19.mlp.c_proj.bias
visual.transformer.resblocks.19.ln_2.weight
visual.transformer.resblocks.19.ln_2.bias
visual.transformer.resblocks.20.attn.in_proj_weight
visual.transformer.resblocks.20.attn.in_proj_bias
visual.transformer.resblocks.20.attn.out_proj.weight
visual.transformer.resblocks.20.attn.out_proj.bias
visual.transformer.resblocks.20.ln_1.weight
visual.transformer.resblocks.20.ln_1.bias
visual.transformer.resblocks.20.mlp.c_fc.weight
visual.transformer.resblocks.20.mlp.c_fc.bias
visual.transformer.resblocks.20.mlp.c_proj.weight
visual.transformer.resblocks.20.mlp.c_proj.bias
visual.transformer.resblocks.20.ln_2.weight
visual.transformer.resblocks.20.ln_2.bias
visual.transformer.resblocks.21.attn.in_proj_weight
visual.transformer.resblocks.21.attn.in_proj_bias
visual.transformer.resblocks.21.attn.out_proj.weight
visual.transformer.resblocks.21.attn.out_proj.bias
visual.transformer.resblocks.21.ln_1.weight
visual.transformer.resblocks.21.ln_1.bias
visual.transformer.resblocks.21.mlp.c_fc.weight
visual.transformer.resblocks.21.mlp.c_fc.bias
visual.transformer.resblocks.21.mlp.c_proj.weight
visual.transformer.resblocks.21.mlp.c_proj.bias
visual.transformer.resblocks.21.ln_2.weight
visual.transformer.resblocks.21.ln_2.bias
visual.transformer.resblocks.22.attn.in_proj_weight
visual.transformer.resblocks.22.attn.in_proj_bias
visual.transformer.resblocks.22.attn.out_proj.weight
visual.transformer.resblocks.22.attn.out_proj.bias
visual.transformer.resblocks.22.ln_1.weight
visual.transformer.resblocks.22.ln_1.bias
visual.transformer.resblocks.22.mlp.c_fc.weight
visual.transformer.resblocks.22.mlp.c_fc.bias
visual.transformer.resblocks.22.mlp.c_proj.weight
visual.transformer.resblocks.22.mlp.c_proj.bias
visual.transformer.resblocks.22.ln_2.weight
visual.transformer.resblocks.22.ln_2.bias
visual.transformer.resblocks.23.attn.in_proj_weight
visual.transformer.resblocks.23.attn.in_proj_bias
visual.transformer.resblocks.23.attn.out_proj.weight
visual.transformer.resblocks.23.attn.out_proj.bias
visual.transformer.resblocks.23.ln_1.weight
visual.transformer.resblocks.23.ln_1.bias
visual.transformer.resblocks.23.mlp.c_fc.weight
visual.transformer.resblocks.23.mlp.c_fc.bias
visual.transformer.resblocks.23.mlp.c_proj.weight
visual.transformer.resblocks.23.mlp.c_proj.bias
visual.transformer.resblocks.23.ln_2.weight
visual.transformer.resblocks.23.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj

# Text related
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection
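
The logit_scale entry at the head of both lists is the learned temperature used in the commented snippet above. A minimal sketch that recomputes logits_per_image by hand from the two encoders and logit_scale (reusing CLIP.png and the three prompts from the earlier example; the result should match model(image, text)):

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # normalize, then scale the cosine similarity by exp(logit_scale)
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    logits_per_image = model.logit_scale.exp() * image_features @ text_features.t()

    print(logits_per_image.softmax(dim=-1))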

Fine-tuning only the parameters of CLIP's last few layers

def get_optim_params(model_name: str):
    if model_name in ['ViT-B/32', 'ViT-B/16']:
        return ['visual.transformer.resblocks.11.attn.in_proj_weight',
                'visual.transformer.resblocks.11.attn.in_proj_bias',
                'visual.transformer.resblocks.11.attn.out_proj.weight',
                'visual.transformer.resblocks.11.attn.out_proj.bias',
                'visual.transformer.resblocks.11.ln_1.weight',
                'visual.transformer.resblocks.11.ln_1.bias',
                'visual.transformer.resblocks.11.mlp.c_fc.weight',
                'visual.transformer.resblocks.11.mlp.c_fc.bias',
                'visual.transformer.resblocks.11.mlp.c_proj.weight',
                'visual.transformer.resblocks.11.mlp.c_proj.bias',
                'visual.transformer.resblocks.11.ln_2.weight',
                'visual.transformer.resblocks.11.ln_2.bias',
                'visual.ln_post.weight',
                'visual.ln_post.bias',
                'visual.proj',
                'transformer.resblocks.11.attn.in_proj_weight',
                'transformer.resblocks.11.attn.in_proj_bias',
                'transformer.resblocks.11.attn.out_proj.weight',
                'transformer.resblocks.11.attn.out_proj.bias',
                'transformer.resblocks.11.ln_1.weight',
                'transformer.resblocks.11.ln_1.bias',
                'transformer.resblocks.11.mlp.c_fc.weight',
                'transformer.resblocks.11.mlp.c_fc.bias',
                'transformer.resblocks.11.mlp.c_proj.weight',
                'transformer.resblocks.11.mlp.c_proj.bias',
                'transformer.resblocks.11.ln_2.weight',
                'transformer.resblocks.11.ln_2.bias',
                'ln_final.weight',
                'ln_final.bias',
                'text_projection']
    elif model_name in ['ViT-L/14', 'ViT-L/14@336px']:
        return ['visual.transformer.resblocks.23.attn.in_proj_weight',
                'visual.transformer.resblocks.23.attn.in_proj_bias',
                'visual.transformer.resblocks.23.attn.out_proj.weight',
                'visual.transformer.resblocks.23.attn.out_proj.bias',
                'visual.transformer.resblocks.23.ln_1.weight',
                'visual.transformer.resblocks.23.ln_1.bias',
                'visual.transformer.resblocks.23.mlp.c_fc.weight',
                'visual.transformer.resblocks.23.mlp.c_fc.bias',
                'visual.transformer.resblocks.23.mlp.c_proj.weight',
                'visual.transformer.resblocks.23.mlp.c_proj.bias',
                'visual.transformer.resblocks.23.ln_2.weight',
                'visual.transformer.resblocks.23.ln_2.bias',
                'visual.ln_post.weight',
                'visual.ln_post.bias',
                'visual.proj',
                'transformer.resblocks.11.attn.in_proj_weight',
                'transformer.resblocks.11.attn.in_proj_bias',
                'transformer.resblocks.11.attn.out_proj.weight',
                'transformer.resblocks.11.attn.out_proj.bias',
                'transformer.resblocks.11.ln_1.weight',
                'transformer.resblocks.11.ln_1.bias',
                'transformer.resblocks.11.mlp.c_fc.weight',
                'transformer.resblocks.11.mlp.c_fc.bias',
                'transformer.resblocks.11.mlp.c_proj.weight',
                'transformer.resblocks.11.mlp.c_proj.bias',
                'transformer.resblocks.11.ln_2.weight',
                'transformer.resblocks.11.ln_2.bias',
                'ln_final.weight',
                'ln_final.bias',
                'text_projection']
    else:
        raise ValueError(f"unsupported model: {model_name}")
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')

        optim_params = get_optim_params('ViT-B/32')

        # freeze every parameter that is not in the fine-tuning list
        for name, param in self.model.named_parameters():
            if name not in optim_params:
                param.requires_grad = False

    def forward(self, image, text):
        image_features = self.model.encode_image(image)
        text_features = self.model.encode_text(text)
        return image_features, text_features


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

# take one small CIFAR10 batch and build "a photo of a <class>" prompts for its labels
root = os.path.expanduser("~/.cache")
cifar10 = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(cifar10, batch_size=8)))
images = train[0]
texts = torch.cat([clip.tokenize(f"a photo of a {cifar10.classes[c]}") for c in train[1]])

# snapshot the initial parameters so we can later check which ones changed
storeParam = {}
for name, param in net.named_parameters():
    storeParam[name] = param.detach().clone()

# a few optimisation steps; MSE between image and text features is only a toy objective
for i in range(10):
    image_features, text_features = net(images, texts)
    loss = F.mse_loss(image_features, text_features)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

# only the parameters returned by get_optim_params should have changed
for name, param in net.named_parameters():
    if not torch.equal(param, storeParam[name]):
        print(name)
